/***************************************************************************************************
 * Copyright (c) 2017-2020, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 *modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright notice,
 *this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *notice, this list of conditions and the following disclaimer in the
 *documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the names of its
 *contributors may be used to endorse or promote products derived from this
 *software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 *AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 *IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 *DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY DIRECT,
 *INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 *OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TOR (INCLUDING
 *NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 *EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
/*
  \file
  \brief Matrix classes with value semantics.
*/

#pragma once

#include <iosfwd>
#include <cmath>

#include "cutlass/cutlass.h"
#include "cutlass/array.h"
#include "cutlass/coord.h"
#include "cutlass/fast_math.h"
#include "cutlass/layout/matrix.h"

namespace cutlass {

/////////////////////////////////////////////////////////////////////////////////////////////////

/// Primary template with partial specializations to follow
template <typename Element, int Rows, int Columns>
struct Matrix;

/////////////////////////////////////////////////////////////////////////////////////////////////

/// 1-by-2 matrix template class definition
template <typename Element_>
struct Matrix<Element_, 1, 2> {
    //
    // Type definitions
    //

    /// Element data type
    using Element = Element_;

    /// Number of rows in matrix
    static int const kRows = 1;

    /// Number of columns in matrix
    static int const kColumns = 2;

    /// Layout of matrix in underlying array
    using Layout = layout::RowMajor;

    /// Number of elements in matrix
    static int const kCount = 2;

    //
    // Data members
    //

    /// Elements of the matrix in row-major layout
    Array<Element, kCount> data;

    //
    // Methods
    //

    /// Constructs a zero matrix
    CUTLASS_HOST_DEVICE
    Matrix() { data.clear(); }

    /// Copy constructor for a 1-by-2 matrix
    CUTLASS_HOST_DEVICE
    Matrix(Matrix const& rhs) { data = rhs.data; }

    /// Constucts a 1-by-2 matrix from scalar elements
    CUTLASS_HOST_DEVICE
    Matrix(Element _0_0, Element _0_1) {
        data[0] = _0_0;
        data[1] = _0_1;
    }

    /// Constructs a matrix from a uniform element
    CUTLASS_HOST_DEVICE
    static Matrix uniform(Element s) {
        Matrix m;

        m.data[0] = s;
        m.data[1] = s;

        return m;
    }

    /// Constructs a matrix from a uniform element 1
    CUTLASS_HOST_DEVICE
    static Matrix ones() { return uniform(Element(1)); }

    /// Constructs a matrix from a uniform element 0
    CUTLASS_HOST_DEVICE
    static Matrix zero() { return Matrix(); }

    /// Returns a transposed matrix
    CUTLASS_HOST_DEVICE
    Matrix<Element, 2, 1> transpose() const {
        Matrix<Element, 2, 1> mt;

        mt.data[0] = data[0];
        mt.data[1] = data[1];

        return mt;
    }

    /// Accesses an element by coordinate
    CUTLASS_HOST_DEVICE
    Element at(int i, int j) const { return data[i * 1 + j]; }

    /// Accesses an element by coordinate
    CUTLASS_HOST_DEVICE
    Element& at(int i, int j) { return data[i * 1 + j]; }

    /// Accesses an element by coordinate
    CUTLASS_HOST_DEVICE
    Element at(Coord<2> const& coord) const { return at(coord[0], coord[1]); }

    /// Accesses an element by coordinate
    CUTLASS_HOST_DEVICE
    Element& at(Coord<2> const& coord) { return at(coord[0], coord[1]); }

    /// Accesses an element by offset
    CUTLASS_HOST_DEVICE
    Element& at(int offset) { return data[offset]; }

    /// Accesses an element by offset
    CUTLASS_HOST_DEVICE
    Element at(int offset) const { return data[offset]; }

    /// Accesses an element by coordinate
    CUTLASS_HOST_DEVICE
    Element operator[](Coord<2> const& coord) const {
        return at(coord[0], coord[1]);
    }

    /// Accesses an element by coordinate
    CUTLASS_HOST_DEVICE
    Element& operator[](Coord<2> const& coord) {
        return at(coord[0], coord[1]);
    }

    /// Accesses an element by offset
    CUTLASS_HOST_DEVICE
    Element& operator[](int offset) { return data[offset]; }

    /// Accesses an element by offset
    CUTLASS_HOST_DEVICE
    Element operator[](int offset) const { return data[offset]; }

    /// Gets a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix<Element, 1, 2> slice_1x2(int i = 0, int j = 0) const {
        Matrix<Element, 1, 2> m;

        m.data[0] = data[i * 2 + j + 0];
        m.data[1] = data[i * 2 + j + 1];

        return m;
    }

    /// Overwrites a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix& set_slice_1x2(Matrix<Element, 1, 2> const& m, int i = 0,
                          int j = 0) {
        data[i * 2 + j + 0] = m.data[0];
        data[i * 2 + j + 1] = m.data[1];

        return *this;
    }

    CUTLASS_HOST_DEVICE
    Matrix<Element, 1, 2> row(int i) const { return slice_1x2(i, 0); }

    Matrix& set_row(Matrix<Element, 1, 2> const& v, int i = 0) {
        return set_slice_1x2(v, i, 0);
    }

    /// Forms a 1-by-2 matrix by horizontally concatenating an Element with an
    /// Element
    CUTLASS_HOST_DEVICE
    static Matrix hcat(Element lhs, Element rhs) { return Matrix(lhs, rhs); }

    /// Concatenates this matrix with a an Element to form a 1-by-3 matrix
    CUTLASS_HOST_DEVICE
    Matrix<Element, 1, 3> hcat(Element rhs) const {
        return Matrix<Element, 1, 3>::hcat(*this, rhs);
    }

    /// Concatenates this matrix with a a 1-by-2 matrix to form a 1-by-4 matrix
    CUTLASS_HOST_DEVICE
    Matrix<Element, 1, 4> hcat(Matrix<Element, 1, 2> const& rhs) const {
        return Matrix<Element, 1, 4>::hcat(*this, rhs);
    }

    /// Concatenates this matrix with a a 1-by-2 matrix to form a 2-by-2 matrix
    CUTLASS_HOST_DEVICE
    Matrix<Element, 2, 2> vcat(Matrix<Element, 1, 2> const& rhs) const {
        return Matrix<Element, 2, 2>::vcat(*this, rhs);
    }

    /// Concatenates this matrix with a a 2-by-2 matrix to form a 3-by-2 matrix
    CUTLASS_HOST_DEVICE
    Matrix<Element, 3, 2> vcat(Matrix<Element, 2, 2> const& rhs) const {
        return Matrix<Element, 3, 2>::vcat(*this, rhs);
    }

    /// Concatenates this matrix with a a 3-by-2 matrix to form a 4-by-2 matrix
    CUTLASS_HOST_DEVICE
    Matrix<Element, 4, 2> vcat(Matrix<Element, 3, 2> const& rhs) const {
        return Matrix<Element, 4, 2>::vcat(*this, rhs);
    }

    /// Elementwise add operator (1-by-2)
    CUTLASS_HOST_DEVICE
    Matrix add(Matrix const& rhs) const {
        Matrix result;

        result.data[0] = data[0] + rhs.data[0];
        result.data[1] = data[1] + rhs.data[1];

        return result;
    }

    /// Elementwise add operator (1-by-2)
    CUTLASS_HOST_DEVICE
    Matrix operator+(Matrix const& rhs) const { return add(rhs); }

    /// Elementwise add operator (1-by-2)
    CUTLASS_HOST_DEVICE
    Matrix& operator+=(Matrix const& rhs) {
        data[0] += rhs.data[0];
        data[1] += rhs.data[1];

        return *this;
    }

    /// Elementwise subtract operator (1-by-2)
    CUTLASS_HOST_DEVICE
    Matrix subtract(Matrix const& rhs) const {
        Matrix result;

        result.data[0] = data[0] - rhs.data[0];
        result.data[1] = data[1] - rhs.data[1];

        return result;
    }

    /// Elementwise subtract operator (1-by-2)
    CUTLASS_HOST_DEVICE
    Matrix operator-(Matrix const& rhs) const { return subtract(rhs); }

    /// Elementwise subtract operator (1-by-2)
    CUTLASS_HOST_DEVICE
    Matrix& operator-=(Matrix const& rhs) {
        data[0] -= rhs.data[0];
        data[1] -= rhs.data[1];

        return *this;
    }

    /// Elementwise multiply operator (1-by-2)
    CUTLASS_HOST_DEVICE
    Matrix multiply(Matrix const& rhs) const {
        Matrix result;

        result.data[0] = data[0] * rhs.data[0];
        result.data[1] = data[1] * rhs.data[1];

        return result;
    }

    /// Scalar multiply operator (1-by-2)
    CUTLASS_HOST_DEVICE
    Matrix multiply(Element const& s) const {
        Matrix result;

        result.data[0] = data[0] * s;
        result.data[1] = data[1] * s;

        return result;
    }

    /// Scalar multiply operator (1-by-2)
    CUTLASS_HOST_DEVICE
    Matrix operator*(Element const& s) const { return multiply(s); }

    /// Scalar multiply operator (1-by-2)
    CUTLASS_HOST_DEVICE
    Matrix& operator*=(Element const& s) {
        data[0] *= s;
        data[1] *= s;

        return *this;
    }

    /// Elementwise divide operator (1-by-2)
    CUTLASS_HOST_DEVICE
    Matrix divide(Matrix const& rhs) const {
        Matrix result;

        result.data[0] = data[0] / rhs.data[0];
        result.data[1] = data[1] / rhs.data[1];

        return result;
    }

    /// Scalar divide operator (1-by-2)
    CUTLASS_HOST_DEVICE
    Matrix divide(Element const& s) const {
        Matrix result;

        result.data[0] = data[0] / s;
        result.data[1] = data[1] / s;

        return result;
    }

    /// Scalar divide operator (1-by-2)
    CUTLASS_HOST_DEVICE
    Matrix operator/(Element const& s) const { return divide(s); }

    /// Scalar divide operator (1-by-2)
    CUTLASS_HOST_DEVICE
    Matrix& operator/=(Element const& s) {
        data[0] /= s;
        data[1] /= s;

        return *this;
    }

    /// Elementwise divide operator (1-by-2)
    CUTLASS_HOST_DEVICE
    Matrix operator/(Matrix const& rhs) const { return divide(rhs); }

    /// Elementwise divide operator (1-by-2)
    CUTLASS_HOST_DEVICE
    Matrix& operator/=(Matrix const& rhs) {
        data[0] /= rhs.data[0];
        data[1] /= rhs.data[1];

        return *this;
    }

    /// Negates each element of the matrix
    CUTLASS_HOST_DEVICE
    Matrix operator-() const {
        Matrix m;

        m.data[0] = -m.data[0];
        m.data[1] = -m.data[1];

        return m;
    }

    /// Matrix product of size 1-by-1-by-2
    CUTLASS_HOST_DEVICE
    Element product(Matrix<Element, 2, 1> const& rhs,
                    Element accum = Element()) const {
        // k=0
        accum += data[0] * rhs.data[0];

        // k=1
        accum += data[1] * rhs.data[1];

        return accum;
    }

    /// Matrix product of size 1-by-1-by-2
    CUTLASS_HOST_DEVICE
    Element operator*(Matrix<Element, 2, 1> const& rhs) const {
        return product(rhs);
    }

    /// Matrix product of size 1-by-2-by-2
    CUTLASS_HOST_DEVICE
    Matrix<Element, 1, 2> product(
            Matrix<Element, 2, 2> const& rhs,
            Matrix<Element, 1, 2> accum = Matrix<Element, 1, 2>()) const {
        // k=0
        accum.data[0] += data[0] * rhs.data[0];
        accum.data[1] += data[0] * rhs.data[1];

        // k=1
        accum.data[0] += data[1] * rhs.data[2];
        accum.data[1] += data[1] * rhs.data[3];

        return accum;
    }

    /// Matrix product of size 1-by-2-by-2
    CUTLASS_HOST_DEVICE
    Matrix<Element, 1, 2> operator*(Matrix<Element, 2, 2> const& rhs) const {
        return product(rhs);
    }

    /// Matrix product of size 1-by-2-by-2
    CUTLASS_HOST_DEVICE
    Matrix& operator*=(Matrix<Element, 2, 2> const& rhs) {
        *this = product(rhs);
        return *this;
    }

    /// Matrix product of size 1-by-3-by-2
    CUTLASS_HOST_DEVICE
    Matrix<Element, 1, 3> product(
            Matrix<Element, 2, 3> const& rhs,
            Matrix<Element, 1, 3> accum = Matrix<Element, 1, 3>()) const {
        // k=0
        accum.data[0] += data[0] * rhs.data[0];
        accum.data[1] += data[0] * rhs.data[1];
        accum.data[2] += data[0] * rhs.data[2];

        // k=1
        accum.data[0] += data[1] * rhs.data[3];
        accum.data[1] += data[1] * rhs.data[4];
        accum.data[2] += data[1] * rhs.data[5];

        return accum;
    }

    /// Matrix product of size 1-by-3-by-2
    CUTLASS_HOST_DEVICE
    Matrix<Element, 1, 3> operator*(Matrix<Element, 2, 3> const& rhs) const {
        return product(rhs);
    }

    /// Matrix product of size 1-by-4-by-2
    CUTLASS_HOST_DEVICE
    Matrix<Element, 1, 4> product(
            Matrix<Element, 2, 4> const& rhs,
            Matrix<Element, 1, 4> accum = Matrix<Element, 1, 4>()) const {
        // k=0
        accum.data[0] += data[0] * rhs.data[0];
        accum.data[1] += data[0] * rhs.data[1];
        accum.data[2] += data[0] * rhs.data[2];
        accum.data[3] += data[0] * rhs.data[3];

        // k=1
        accum.data[0] += data[1] * rhs.data[4];
        accum.data[1] += data[1] * rhs.data[5];
        accum.data[2] += data[1] * rhs.data[6];
        accum.data[3] += data[1] * rhs.data[7];

        return accum;
    }

    /// Matrix product of size 1-by-4-by-2
    CUTLASS_HOST_DEVICE
    Matrix<Element, 1, 4> operator*(Matrix<Element, 2, 4> const& rhs) const {
        return product(rhs);
    }

    /// Dot product of vectors with extent 2
    CUTLASS_HOST_DEVICE
    Element dot(Matrix<Element, 2, 1> const& rhs,
                Element accum = Element()) const {
        accum += data[0] * rhs.data[0];
        accum += data[1] * rhs.data[1];
        return accum;
    }

    /// Dot product of vectors with extent 2
    CUTLASS_HOST_DEVICE
    Element dot(Matrix<Element, 1, 2> const& rhs,
                Element accum = Element()) const {
        accum += data[0] * rhs.data[0];
        accum += data[1] * rhs.data[1];
        return accum;
    }

    /// Returns the sum of elements
    CUTLASS_HOST_DEVICE
    Element sum(Element accum = Element()) const {
        accum += data[0];
        accum += data[1];

        return accum;
    }

    /// Returns the sum of squared elements
    CUTLASS_HOST_DEVICE
    Element norm(Element accum = Element()) const {
        accum += data[0] * data[0];
        accum += data[1] * data[1];

        return accum;
    }

    /// Returns square root of the norm
    CUTLASS_HOST_DEVICE
    Element magnitude() const { return fast_sqrt(norm()); }

    /// Returns the sum of diagonal elements
    CUTLASS_HOST_DEVICE
    Element trace(Element accum = Element()) const {
        accum += data[0];

        return accum;
    }
};

/// Template alias for 1-by-2 matrix
template <typename Element>
using Matrix1x2 = Matrix<Element, 1, 2>;

/// Free funciton to infer element type from template arguments
template <typename Element>
CUTLASS_HOST_DEVICE Matrix1x2<Element> make_Matrix1x2(Element _0_0,
                                                      Element _0_1) {
    return Matrix1x2<Element>(_0_0, _0_1);
}

/////////////////////////////////////////////////////////////////////////////////////////////////

/// 1-by-3 matrix template class definition
template <typename Element_>
struct Matrix<Element_, 1, 3> {
    //
    // Type definitions
    //

    /// Element data type
    using Element = Element_;

    /// Number of rows in matrix
    static int const kRows = 1;

    /// Number of columns in matrix
    static int const kColumns = 3;

    /// Layout of matrix in underlying array
    using Layout = layout::RowMajor;

    /// Number of elements in matrix
    static int const kCount = 3;

    //
    // Data members
    //

    /// Elements of the matrix in row-major layout
    Array<Element, kCount> data;

    //
    // Methods
    //

    /// Constructs a zero matrix
    CUTLASS_HOST_DEVICE
    Matrix() { data.clear(); }

    /// Copy constructor for a 1-by-3 matrix
    CUTLASS_HOST_DEVICE
    Matrix(Matrix const& rhs) { data = rhs.data; }

    /// Constucts a 1-by-3 matrix from scalar elements
    CUTLASS_HOST_DEVICE
    Matrix(Element _0_0, Element _0_1, Element _0_2) {
        data[0] = _0_0;
        data[1] = _0_1;
        data[2] = _0_2;
    }

    /// Constructs a matrix from a uniform element
    CUTLASS_HOST_DEVICE
    static Matrix uniform(Element s) {
        Matrix m;

        m.data[0] = s;
        m.data[1] = s;
        m.data[2] = s;

        return m;
    }

    /// Constructs a matrix from a uniform element 1
    CUTLASS_HOST_DEVICE
    static Matrix ones() { return uniform(Element(1)); }

    /// Constructs a matrix from a uniform element 0
    CUTLASS_HOST_DEVICE
    static Matrix zero() { return Matrix(); }

    /// Returns a transposed matrix
    CUTLASS_HOST_DEVICE
    Matrix<Element, 3, 1> transpose() const {
        Matrix<Element, 3, 1> mt;

        mt.data[0] = data[0];
        mt.data[1] = data[1];
        mt.data[2] = data[2];

        return mt;
    }

    /// Accesses an element by coordinate
    CUTLASS_HOST_DEVICE
    Element at(int i, int j) const { return data[i * 1 + j]; }

    /// Accesses an element by coordinate
    CUTLASS_HOST_DEVICE
    Element& at(int i, int j) { return data[i * 1 + j]; }

    /// Accesses an element by coordinate
    CUTLASS_HOST_DEVICE
    Element at(Coord<2> const& coord) const { return at(coord[0], coord[1]); }

    /// Accesses an element by coordinate
    CUTLASS_HOST_DEVICE
    Element& at(Coord<2> const& coord) { return at(coord[0], coord[1]); }

    /// Accesses an element by offset
    CUTLASS_HOST_DEVICE
    Element& at(int offset) { return data[offset]; }

    /// Accesses an element by offset
    CUTLASS_HOST_DEVICE
    Element at(int offset) const { return data[offset]; }

    /// Accesses an element by coordinate
    CUTLASS_HOST_DEVICE
    Element operator[](Coord<2> const& coord) const {
        return at(coord[0], coord[1]);
    }

    /// Accesses an element by coordinate
    CUTLASS_HOST_DEVICE
    Element& operator[](Coord<2> const& coord) {
        return at(coord[0], coord[1]);
    }

    /// Accesses an element by offset
    CUTLASS_HOST_DEVICE
    Element& operator[](int offset) { return data[offset]; }

    /// Accesses an element by offset
    CUTLASS_HOST_DEVICE
    Element operator[](int offset) const { return data[offset]; }

    /// Gets a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix<Element, 1, 2> slice_1x2(int i = 0, int j = 0) const {
        Matrix<Element, 1, 2> m;

        m.data[0] = data[i * 3 + j + 0];
        m.data[1] = data[i * 3 + j + 1];

        return m;
    }

    /// Overwrites a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix& set_slice_1x2(Matrix<Element, 1, 2> const& m, int i = 0,
                          int j = 0) {
        data[i * 3 + j + 0] = m.data[0];
        data[i * 3 + j + 1] = m.data[1];

        return *this;
    }

    /// Gets a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix<Element, 1, 3> slice_1x3(int i = 0, int j = 0) const {
        Matrix<Element, 1, 3> m;

        m.data[0] = data[i * 3 + j + 0];
        m.data[1] = data[i * 3 + j + 1];
        m.data[2] = data[i * 3 + j + 2];

        return m;
    }

    /// Overwrites a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix& set_slice_1x3(Matrix<Element, 1, 3> const& m, int i = 0,
                          int j = 0) {
        data[i * 3 + j + 0] = m.data[0];
        data[i * 3 + j + 1] = m.data[1];
        data[i * 3 + j + 2] = m.data[2];

        return *this;
    }

    CUTLASS_HOST_DEVICE
    Matrix<Element, 1, 3> row(int i) const { return slice_1x3(i, 0); }

    Matrix& set_row(Matrix<Element, 1, 3> const& v, int i = 0) {
        return set_slice_1x3(v, i, 0);
    }

    /// Forms a 1-by-3 matrix by horizontally concatenating an Element with a
    /// 1-by-2 matrix
    CUTLASS_HOST_DEVICE
    static Matrix hcat(Element lhs, Matrix<Element, 1, 2> const& rhs) {
        return Matrix(lhs, rhs.at(0, 0), rhs.at(0, 1));
    }

    /// Forms a 1-by-3 matrix by horizontally concatenating a 1-by-2 matrix with
    /// an Element
    CUTLASS_HOST_DEVICE
    static Matrix hcat(Matrix<Element, 1, 2> const& lhs, Element rhs) {
        return Matrix(lhs.at(0, 0), lhs.at(0, 1), rhs);
    }

    /// Concatenates this matrix with a an Element to form a 1-by-4 matrix
    CUTLASS_HOST_DEVICE
    Matrix<Element, 1, 4> hcat(Element rhs) const {
        return Matrix<Element, 1, 4>::hcat(*this, rhs);
    }

    /// Concatenates this matrix with a a 1-by-3 matrix to form a 2-by-3 matrix
    CUTLASS_HOST_DEVICE
    Matrix<Element, 2, 3> vcat(Matrix<Element, 1, 3> const& rhs) const {
        return Matrix<Element, 2, 3>::vcat(*this, rhs);
    }

    /// Concatenates this matrix with a a 2-by-3 matrix to form a 3-by-3 matrix
    CUTLASS_HOST_DEVICE
    Matrix<Element, 3, 3> vcat(Matrix<Element, 2, 3> const& rhs) const {
        return Matrix<Element, 3, 3>::vcat(*this, rhs);
    }

    /// Concatenates this matrix with a a 3-by-3 matrix to form a 4-by-3 matrix
    CUTLASS_HOST_DEVICE
    Matrix<Element, 4, 3> vcat(Matrix<Element, 3, 3> const& rhs) const {
        return Matrix<Element, 4, 3>::vcat(*this, rhs);
    }

    /// Elementwise add operator (1-by-3)
    CUTLASS_HOST_DEVICE
    Matrix add(Matrix const& rhs) const {
        Matrix result;

        result.data[0] = data[0] + rhs.data[0];
        result.data[1] = data[1] + rhs.data[1];
        result.data[2] = data[2] + rhs.data[2];

        return result;
    }

    /// Elementwise add operator (1-by-3)
    CUTLASS_HOST_DEVICE
    Matrix operator+(Matrix const& rhs) const { return add(rhs); }

    /// Elementwise add operator (1-by-3)
    CUTLASS_HOST_DEVICE
    Matrix& operator+=(Matrix const& rhs) {
        data[0] += rhs.data[0];
        data[1] += rhs.data[1];
        data[2] += rhs.data[2];

        return *this;
    }

    /// Elementwise subtract operator (1-by-3)
    CUTLASS_HOST_DEVICE
    Matrix subtract(Matrix const& rhs) const {
        Matrix result;

        result.data[0] = data[0] - rhs.data[0];
        result.data[1] = data[1] - rhs.data[1];
        result.data[2] = data[2] - rhs.data[2];

        return result;
    }

    /// Elementwise subtract operator (1-by-3)
    CUTLASS_HOST_DEVICE
    Matrix operator-(Matrix const& rhs) const { return subtract(rhs); }

    /// Elementwise subtract operator (1-by-3)
    CUTLASS_HOST_DEVICE
    Matrix& operator-=(Matrix const& rhs) {
        data[0] -= rhs.data[0];
        data[1] -= rhs.data[1];
        data[2] -= rhs.data[2];

        return *this;
    }

    /// Elementwise multiply operator (1-by-3)
    CUTLASS_HOST_DEVICE
    Matrix multiply(Matrix const& rhs) const {
        Matrix result;

        result.data[0] = data[0] * rhs.data[0];
        result.data[1] = data[1] * rhs.data[1];
        result.data[2] = data[2] * rhs.data[2];

        return result;
    }

    /// Scalar multiply operator (1-by-3)
    CUTLASS_HOST_DEVICE
    Matrix multiply(Element const& s) const {
        Matrix result;

        result.data[0] = data[0] * s;
        result.data[1] = data[1] * s;
        result.data[2] = data[2] * s;

        return result;
    }

    /// Scalar multiply operator (1-by-3)
    CUTLASS_HOST_DEVICE
    Matrix operator*(Element const& s) const { return multiply(s); }

    /// Scalar multiply operator (1-by-3)
    CUTLASS_HOST_DEVICE
    Matrix& operator*=(Element const& s) {
        data[0] *= s;
        data[1] *= s;
        data[2] *= s;

        return *this;
    }

    /// Elementwise divide operator (1-by-3)
    CUTLASS_HOST_DEVICE
    Matrix divide(Matrix const& rhs) const {
        Matrix result;

        result.data[0] = data[0] / rhs.data[0];
        result.data[1] = data[1] / rhs.data[1];
        result.data[2] = data[2] / rhs.data[2];

        return result;
    }

    /// Scalar divide operator (1-by-3)
    CUTLASS_HOST_DEVICE
    Matrix divide(Element const& s) const {
        Matrix result;

        result.data[0] = data[0] / s;
        result.data[1] = data[1] / s;
        result.data[2] = data[2] / s;

        return result;
    }

    /// Scalar divide operator (1-by-3)
    CUTLASS_HOST_DEVICE
    Matrix operator/(Element const& s) const { return divide(s); }

    /// Scalar divide operator (1-by-3)
    CUTLASS_HOST_DEVICE
    Matrix& operator/=(Element const& s) {
        data[0] /= s;
        data[1] /= s;
        data[2] /= s;

        return *this;
    }

    /// Elementwise divide operator (1-by-3)
    CUTLASS_HOST_DEVICE
    Matrix operator/(Matrix const& rhs) const { return divide(rhs); }

    /// Elementwise divide operator (1-by-3)
    CUTLASS_HOST_DEVICE
    Matrix& operator/=(Matrix const& rhs) {
        data[0] /= rhs.data[0];
        data[1] /= rhs.data[1];
        data[2] /= rhs.data[2];

        return *this;
    }

    /// Negates each element of the matrix
    CUTLASS_HOST_DEVICE
    Matrix operator-() const {
        Matrix m;

        m.data[0] = -m.data[0];
        m.data[1] = -m.data[1];
        m.data[2] = -m.data[2];

        return m;
    }

    /// Matrix product of size 1-by-1-by-3
    CUTLASS_HOST_DEVICE
    Element product(Matrix<Element, 3, 1> const& rhs,
                    Element accum = Element()) const {
        // k=0
        accum += data[0] * rhs.data[0];

        // k=1
        accum += data[1] * rhs.data[1];

        // k=2
        accum += data[2] * rhs.data[2];

        return accum;
    }

    /// Matrix product of size 1-by-1-by-3
    CUTLASS_HOST_DEVICE
    Element operator*(Matrix<Element, 3, 1> const& rhs) const {
        return product(rhs);
    }

    /// Matrix product of size 1-by-2-by-3
    CUTLASS_HOST_DEVICE
    Matrix<Element, 1, 2> product(
            Matrix<Element, 3, 2> const& rhs,
            Matrix<Element, 1, 2> accum = Matrix<Element, 1, 2>()) const {
        // k=0
        accum.data[0] += data[0] * rhs.data[0];
        accum.data[1] += data[0] * rhs.data[1];

        // k=1
        accum.data[0] += data[1] * rhs.data[2];
        accum.data[1] += data[1] * rhs.data[3];

        // k=2
        accum.data[0] += data[2] * rhs.data[4];
        accum.data[1] += data[2] * rhs.data[5];

        return accum;
    }

    /// Matrix product of size 1-by-2-by-3
    CUTLASS_HOST_DEVICE
    Matrix<Element, 1, 2> operator*(Matrix<Element, 3, 2> const& rhs) const {
        return product(rhs);
    }

    /// Matrix product of size 1-by-3-by-3
    CUTLASS_HOST_DEVICE
    Matrix<Element, 1, 3> product(
            Matrix<Element, 3, 3> const& rhs,
            Matrix<Element, 1, 3> accum = Matrix<Element, 1, 3>()) const {
        // k=0
        accum.data[0] += data[0] * rhs.data[0];
        accum.data[1] += data[0] * rhs.data[1];
        accum.data[2] += data[0] * rhs.data[2];

        // k=1
        accum.data[0] += data[1] * rhs.data[3];
        accum.data[1] += data[1] * rhs.data[4];
        accum.data[2] += data[1] * rhs.data[5];

        // k=2
        accum.data[0] += data[2] * rhs.data[6];
        accum.data[1] += data[2] * rhs.data[7];
        accum.data[2] += data[2] * rhs.data[8];

        return accum;
    }

    /// Matrix product of size 1-by-3-by-3
    CUTLASS_HOST_DEVICE
    Matrix<Element, 1, 3> operator*(Matrix<Element, 3, 3> const& rhs) const {
        return product(rhs);
    }

    /// Matrix product of size 1-by-3-by-3
    CUTLASS_HOST_DEVICE
    Matrix& operator*=(Matrix<Element, 3, 3> const& rhs) {
        *this = product(rhs);
        return *this;
    }

    /// Matrix product of size 1-by-4-by-3
    CUTLASS_HOST_DEVICE
    Matrix<Element, 1, 4> product(
            Matrix<Element, 3, 4> const& rhs,
            Matrix<Element, 1, 4> accum = Matrix<Element, 1, 4>()) const {
        // k=0
        accum.data[0] += data[0] * rhs.data[0];
        accum.data[1] += data[0] * rhs.data[1];
        accum.data[2] += data[0] * rhs.data[2];
        accum.data[3] += data[0] * rhs.data[3];

        // k=1
        accum.data[0] += data[1] * rhs.data[4];
        accum.data[1] += data[1] * rhs.data[5];
        accum.data[2] += data[1] * rhs.data[6];
        accum.data[3] += data[1] * rhs.data[7];

        // k=2
        accum.data[0] += data[2] * rhs.data[8];
        accum.data[1] += data[2] * rhs.data[9];
        accum.data[2] += data[2] * rhs.data[10];
        accum.data[3] += data[2] * rhs.data[11];

        return accum;
    }

    /// Matrix product of size 1-by-4-by-3
    CUTLASS_HOST_DEVICE
    Matrix<Element, 1, 4> operator*(Matrix<Element, 3, 4> const& rhs) const {
        return product(rhs);
    }

    /// Dot product of vectors with extent 3
    CUTLASS_HOST_DEVICE
    Element dot(Matrix<Element, 3, 1> const& rhs,
                Element accum = Element()) const {
        accum += data[0] * rhs.data[0];
        accum += data[1] * rhs.data[1];
        accum += data[2] * rhs.data[2];
        return accum;
    }

    /// Dot product of vectors with extent 3
    CUTLASS_HOST_DEVICE
    Element dot(Matrix<Element, 1, 3> const& rhs,
                Element accum = Element()) const {
        accum += data[0] * rhs.data[0];
        accum += data[1] * rhs.data[1];
        accum += data[2] * rhs.data[2];
        return accum;
    }

    /// Returns the sum of elements
    CUTLASS_HOST_DEVICE
    Element sum(Element accum = Element()) const {
        accum += data[0];
        accum += data[1];
        accum += data[2];

        return accum;
    }

    /// Returns the sum of squared elements
    CUTLASS_HOST_DEVICE
    Element norm(Element accum = Element()) const {
        accum += data[0] * data[0];
        accum += data[1] * data[1];
        accum += data[2] * data[2];

        return accum;
    }

    /// Returns square root of the norm
    CUTLASS_HOST_DEVICE
    Element magnitude() const { return fast_sqrt(norm()); }

    /// Returns the sum of diagonal elements
    CUTLASS_HOST_DEVICE
    Element trace(Element accum = Element()) const {
        accum += data[0];

        return accum;
    }

    /// Cross product
    CUTLASS_HOST_DEVICE
    Matrix cross(Matrix const& rhs) const {
        return Matrix(data[1] * rhs.data[2] - data[2] * rhs.data[1],
                      data[0] * rhs.data[2] - data[2] * rhs.data[1],
                      data[0] * rhs.data[1] - data[1] * rhs.data[0]);
    }
};

/// Template alias for 1-by-3 matrix
template <typename Element>
using Matrix1x3 = Matrix<Element, 1, 3>;

/// Free funciton to infer element type from template arguments
template <typename Element>
CUTLASS_HOST_DEVICE Matrix1x3<Element> make_Matrix1x3(Element _0_0,
                                                      Element _0_1,
                                                      Element _0_2) {
    return Matrix1x3<Element>(_0_0, _0_1, _0_2);
}

/////////////////////////////////////////////////////////////////////////////////////////////////

/// 1-by-4 matrix template class definition
template <typename Element_>
struct Matrix<Element_, 1, 4> {
    //
    // Type definitions
    //

    /// Element data type
    using Element = Element_;

    /// Number of rows in matrix
    static int const kRows = 1;

    /// Number of columns in matrix
    static int const kColumns = 4;

    /// Layout of matrix in underlying array
    using Layout = layout::RowMajor;

    /// Number of elements in matrix
    static int const kCount = 4;

    //
    // Data members
    //

    /// Elements of the matrix in row-major layout
    Array<Element, kCount> data;

    //
    // Methods
    //

    /// Constructs a zero matrix
    CUTLASS_HOST_DEVICE
    Matrix() { data.clear(); }

    /// Copy constructor for a 1-by-4 matrix
    CUTLASS_HOST_DEVICE
    Matrix(Matrix const& rhs) { data = rhs.data; }

    /// Constucts a 1-by-4 matrix from scalar elements
    CUTLASS_HOST_DEVICE
    Matrix(Element _0_0, Element _0_1, Element _0_2, Element _0_3) {
        data[0] = _0_0;
        data[1] = _0_1;
        data[2] = _0_2;
        data[3] = _0_3;
    }

    /// Constructs a matrix from a uniform element
    CUTLASS_HOST_DEVICE
    static Matrix uniform(Element s) {
        Matrix m;

        m.data[0] = s;
        m.data[1] = s;
        m.data[2] = s;
        m.data[3] = s;

        return m;
    }

    /// Constructs a matrix from a uniform element 1
    CUTLASS_HOST_DEVICE
    static Matrix ones() { return uniform(Element(1)); }

    /// Constructs a matrix from a uniform element 0
    CUTLASS_HOST_DEVICE
    static Matrix zero() { return Matrix(); }

    /// Returns a transposed matrix
    CUTLASS_HOST_DEVICE
    Matrix<Element, 4, 1> transpose() const {
        Matrix<Element, 4, 1> mt;

        mt.data[0] = data[0];
        mt.data[1] = data[1];
        mt.data[2] = data[2];
        mt.data[3] = data[3];

        return mt;
    }

    /// Accesses an element by coordinate
    CUTLASS_HOST_DEVICE
    Element at(int i, int j) const { return data[i * 1 + j]; }

    /// Accesses an element by coordinate
    CUTLASS_HOST_DEVICE
    Element& at(int i, int j) { return data[i * 1 + j]; }

    /// Accesses an element by coordinate
    CUTLASS_HOST_DEVICE
    Element at(Coord<2> const& coord) const { return at(coord[0], coord[1]); }

    /// Accesses an element by coordinate
    CUTLASS_HOST_DEVICE
    Element& at(Coord<2> const& coord) { return at(coord[0], coord[1]); }

    /// Accesses an element by offset
    CUTLASS_HOST_DEVICE
    Element& at(int offset) { return data[offset]; }

    /// Accesses an element by offset
    CUTLASS_HOST_DEVICE
    Element at(int offset) const { return data[offset]; }

    /// Accesses an element by coordinate
    CUTLASS_HOST_DEVICE
    Element operator[](Coord<2> const& coord) const {
        return at(coord[0], coord[1]);
    }

    /// Accesses an element by coordinate
    CUTLASS_HOST_DEVICE
    Element& operator[](Coord<2> const& coord) {
        return at(coord[0], coord[1]);
    }

    /// Accesses an element by offset
    CUTLASS_HOST_DEVICE
    Element& operator[](int offset) { return data[offset]; }

    /// Accesses an element by offset
    CUTLASS_HOST_DEVICE
    Element operator[](int offset) const { return data[offset]; }

    /// Gets a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix<Element, 1, 2> slice_1x2(int i = 0, int j = 0) const {
        Matrix<Element, 1, 2> m;

        m.data[0] = data[i * 4 + j + 0];
        m.data[1] = data[i * 4 + j + 1];

        return m;
    }

    /// Overwrites a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix& set_slice_1x2(Matrix<Element, 1, 2> const& m, int i = 0,
                          int j = 0) {
        data[i * 4 + j + 0] = m.data[0];
        data[i * 4 + j + 1] = m.data[1];

        return *this;
    }

    /// Gets a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix<Element, 1, 3> slice_1x3(int i = 0, int j = 0) const {
        Matrix<Element, 1, 3> m;

        m.data[0] = data[i * 4 + j + 0];
        m.data[1] = data[i * 4 + j + 1];
        m.data[2] = data[i * 4 + j + 2];

        return m;
    }

    /// Overwrites a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix& set_slice_1x3(Matrix<Element, 1, 3> const& m, int i = 0,
                          int j = 0) {
        data[i * 4 + j + 0] = m.data[0];
        data[i * 4 + j + 1] = m.data[1];
        data[i * 4 + j + 2] = m.data[2];

        return *this;
    }

    /// Gets a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix<Element, 1, 4> slice_1x4(int i = 0, int j = 0) const {
        Matrix<Element, 1, 4> m;

        m.data[0] = data[i * 4 + j + 0];
        m.data[1] = data[i * 4 + j + 1];
        m.data[2] = data[i * 4 + j + 2];
        m.data[3] = data[i * 4 + j + 3];

        return m;
    }

    /// Overwrites a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix& set_slice_1x4(Matrix<Element, 1, 4> const& m, int i = 0,
                          int j = 0) {
        data[i * 4 + j + 0] = m.data[0];
        data[i * 4 + j + 1] = m.data[1];
        data[i * 4 + j + 2] = m.data[2];
        data[i * 4 + j + 3] = m.data[3];

        return *this;
    }

    CUTLASS_HOST_DEVICE
    Matrix<Element, 1, 4> row(int i) const { return slice_1x4(i, 0); }

    Matrix& set_row(Matrix<Element, 1, 4> const& v, int i = 0) {
        return set_slice_1x4(v, i, 0);
    }

    /// Forms a 1-by-4 matrix by horizontally concatenating an Element with a
    /// 1-by-3 matrix
    CUTLASS_HOST_DEVICE
    static Matrix hcat(Element lhs, Matrix<Element, 1, 3> const& rhs) {
        return Matrix(lhs, rhs.at(0, 0), rhs.at(0, 1), rhs.at(0, 2));
    }

    /// Forms a 1-by-4 matrix by horizontally concatenating a 1-by-2 matrix with
    /// a 1-by-2 matrix
    CUTLASS_HOST_DEVICE
    static Matrix hcat(Matrix<Element, 1, 2> const& lhs,
                       Matrix<Element, 1, 2> const& rhs) {
        return Matrix(lhs.at(0, 0), lhs.at(0, 1), rhs.at(0, 0), rhs.at(0, 1));
    }

    /// Forms a 1-by-4 matrix by horizontally concatenating a 1-by-3 matrix with
    /// an Element
    CUTLASS_HOST_DEVICE
    static Matrix hcat(Matrix<Element, 1, 3> const& lhs, Element rhs) {
        return Matrix(lhs.at(0, 0), lhs.at(0, 1), lhs.at(0, 2), rhs);
    }

    /// Concatenates this matrix with a a 1-by-4 matrix to form a 2-by-4 matrix
    CUTLASS_HOST_DEVICE
    Matrix<Element, 2, 4> vcat(Matrix<Element, 1, 4> const& rhs) const {
        return Matrix<Element, 2, 4>::vcat(*this, rhs);
    }

    /// Concatenates this matrix with a a 2-by-4 matrix to form a 3-by-4 matrix
    CUTLASS_HOST_DEVICE
    Matrix<Element, 3, 4> vcat(Matrix<Element, 2, 4> const& rhs) const {
        return Matrix<Element, 3, 4>::vcat(*this, rhs);
    }

    /// Concatenates this matrix with a a 3-by-4 matrix to form a 4-by-4 matrix
    CUTLASS_HOST_DEVICE
    Matrix<Element, 4, 4> vcat(Matrix<Element, 3, 4> const& rhs) const {
        return Matrix<Element, 4, 4>::vcat(*this, rhs);
    }

    /// Elementwise add operator (1-by-4)
    CUTLASS_HOST_DEVICE
    Matrix add(Matrix const& rhs) const {
        Matrix result;

        result.data[0] = data[0] + rhs.data[0];
        result.data[1] = data[1] + rhs.data[1];
        result.data[2] = data[2] + rhs.data[2];
        result.data[3] = data[3] + rhs.data[3];

        return result;
    }

    /// Elementwise add operator (1-by-4)
    CUTLASS_HOST_DEVICE
    Matrix operator+(Matrix const& rhs) const { return add(rhs); }

    /// Elementwise add operator (1-by-4)
    CUTLASS_HOST_DEVICE
    Matrix& operator+=(Matrix const& rhs) {
        data[0] += rhs.data[0];
        data[1] += rhs.data[1];
        data[2] += rhs.data[2];
        data[3] += rhs.data[3];

        return *this;
    }

    /// Elementwise subtract operator (1-by-4)
    CUTLASS_HOST_DEVICE
    Matrix subtract(Matrix const& rhs) const {
        Matrix result;

        result.data[0] = data[0] - rhs.data[0];
        result.data[1] = data[1] - rhs.data[1];
        result.data[2] = data[2] - rhs.data[2];
        result.data[3] = data[3] - rhs.data[3];

        return result;
    }

    /// Elementwise subtract operator (1-by-4)
    CUTLASS_HOST_DEVICE
    Matrix operator-(Matrix const& rhs) const { return subtract(rhs); }

    /// Elementwise subtract operator (1-by-4)
    CUTLASS_HOST_DEVICE
    Matrix& operator-=(Matrix const& rhs) {
        data[0] -= rhs.data[0];
        data[1] -= rhs.data[1];
        data[2] -= rhs.data[2];
        data[3] -= rhs.data[3];

        return *this;
    }

    /// Elementwise multiply operator (1-by-4)
    CUTLASS_HOST_DEVICE
    Matrix multiply(Matrix const& rhs) const {
        Matrix result;

        result.data[0] = data[0] * rhs.data[0];
        result.data[1] = data[1] * rhs.data[1];
        result.data[2] = data[2] * rhs.data[2];
        result.data[3] = data[3] * rhs.data[3];

        return result;
    }

    /// Scalar multiply operator (1-by-4)
    CUTLASS_HOST_DEVICE
    Matrix multiply(Element const& s) const {
        Matrix result;

        result.data[0] = data[0] * s;
        result.data[1] = data[1] * s;
        result.data[2] = data[2] * s;
        result.data[3] = data[3] * s;

        return result;
    }

    /// Scalar multiply operator (1-by-4)
    CUTLASS_HOST_DEVICE
    Matrix operator*(Element const& s) const { return multiply(s); }

    /// Scalar multiply operator (1-by-4)
    CUTLASS_HOST_DEVICE
    Matrix& operator*=(Element const& s) {
        data[0] *= s;
        data[1] *= s;
        data[2] *= s;
        data[3] *= s;

        return *this;
    }

    /// Elementwise divide operator (1-by-4)
    CUTLASS_HOST_DEVICE
    Matrix divide(Matrix const& rhs) const {
        Matrix result;

        result.data[0] = data[0] / rhs.data[0];
        result.data[1] = data[1] / rhs.data[1];
        result.data[2] = data[2] / rhs.data[2];
        result.data[3] = data[3] / rhs.data[3];

        return result;
    }

    /// Scalar divide operator (1-by-4)
    CUTLASS_HOST_DEVICE
    Matrix divide(Element const& s) const {
        Matrix result;

        result.data[0] = data[0] / s;
        result.data[1] = data[1] / s;
        result.data[2] = data[2] / s;
        result.data[3] = data[3] / s;

        return result;
    }

    /// Scalar divide operator (1-by-4)
    CUTLASS_HOST_DEVICE
    Matrix operator/(Element const& s) const { return divide(s); }

    /// Scalar divide operator (1-by-4)
    CUTLASS_HOST_DEVICE
    Matrix& operator/=(Element const& s) {
        data[0] /= s;
        data[1] /= s;
        data[2] /= s;
        data[3] /= s;

        return *this;
    }

    /// Elementwise divide operator (1-by-4)
    CUTLASS_HOST_DEVICE
    Matrix operator/(Matrix const& rhs) const { return divide(rhs); }

    /// Elementwise divide operator (1-by-4)
    CUTLASS_HOST_DEVICE
    Matrix& operator/=(Matrix const& rhs) {
        data[0] /= rhs.data[0];
        data[1] /= rhs.data[1];
        data[2] /= rhs.data[2];
        data[3] /= rhs.data[3];

        return *this;
    }

    /// Negates each element of the matrix
    CUTLASS_HOST_DEVICE
    Matrix operator-() const {
        Matrix m;

        m.data[0] = -m.data[0];
        m.data[1] = -m.data[1];
        m.data[2] = -m.data[2];
        m.data[3] = -m.data[3];

        return m;
    }

    /// Matrix product of size 1-by-1-by-4
    CUTLASS_HOST_DEVICE
    Element product(Matrix<Element, 4, 1> const& rhs,
                    Element accum = Element()) const {
        // k=0
        accum += data[0] * rhs.data[0];

        // k=1
        accum += data[1] * rhs.data[1];

        // k=2
        accum += data[2] * rhs.data[2];

        // k=3
        accum += data[3] * rhs.data[3];

        return accum;
    }

    /// Matrix product of size 1-by-1-by-4
    CUTLASS_HOST_DEVICE
    Element operator*(Matrix<Element, 4, 1> const& rhs) const {
        return product(rhs);
    }

    /// Matrix product of size 1-by-2-by-4
    CUTLASS_HOST_DEVICE
    Matrix<Element, 1, 2> product(
            Matrix<Element, 4, 2> const& rhs,
            Matrix<Element, 1, 2> accum = Matrix<Element, 1, 2>()) const {
        // k=0
        accum.data[0] += data[0] * rhs.data[0];
        accum.data[1] += data[0] * rhs.data[1];

        // k=1
        accum.data[0] += data[1] * rhs.data[2];
        accum.data[1] += data[1] * rhs.data[3];

        // k=2
        accum.data[0] += data[2] * rhs.data[4];
        accum.data[1] += data[2] * rhs.data[5];

        // k=3
        accum.data[0] += data[3] * rhs.data[6];
        accum.data[1] += data[3] * rhs.data[7];

        return accum;
    }

    /// Matrix product of size 1-by-2-by-4
    CUTLASS_HOST_DEVICE
    Matrix<Element, 1, 2> operator*(Matrix<Element, 4, 2> const& rhs) const {
        return product(rhs);
    }

    /// Matrix product of size 1-by-3-by-4
    CUTLASS_HOST_DEVICE
    Matrix<Element, 1, 3> product(
            Matrix<Element, 4, 3> const& rhs,
            Matrix<Element, 1, 3> accum = Matrix<Element, 1, 3>()) const {
        // k=0
        accum.data[0] += data[0] * rhs.data[0];
        accum.data[1] += data[0] * rhs.data[1];
        accum.data[2] += data[0] * rhs.data[2];

        // k=1
        accum.data[0] += data[1] * rhs.data[3];
        accum.data[1] += data[1] * rhs.data[4];
        accum.data[2] += data[1] * rhs.data[5];

        // k=2
        accum.data[0] += data[2] * rhs.data[6];
        accum.data[1] += data[2] * rhs.data[7];
        accum.data[2] += data[2] * rhs.data[8];

        // k=3
        accum.data[0] += data[3] * rhs.data[9];
        accum.data[1] += data[3] * rhs.data[10];
        accum.data[2] += data[3] * rhs.data[11];

        return accum;
    }

    /// Matrix product of size 1-by-3-by-4
    CUTLASS_HOST_DEVICE
    Matrix<Element, 1, 3> operator*(Matrix<Element, 4, 3> const& rhs) const {
        return product(rhs);
    }

    /// Matrix product of size 1-by-4-by-4
    CUTLASS_HOST_DEVICE
    Matrix<Element, 1, 4> product(
            Matrix<Element, 4, 4> const& rhs,
            Matrix<Element, 1, 4> accum = Matrix<Element, 1, 4>()) const {
        // k=0
        accum.data[0] += data[0] * rhs.data[0];
        accum.data[1] += data[0] * rhs.data[1];
        accum.data[2] += data[0] * rhs.data[2];
        accum.data[3] += data[0] * rhs.data[3];

        // k=1
        accum.data[0] += data[1] * rhs.data[4];
        accum.data[1] += data[1] * rhs.data[5];
        accum.data[2] += data[1] * rhs.data[6];
        accum.data[3] += data[1] * rhs.data[7];

        // k=2
        accum.data[0] += data[2] * rhs.data[8];
        accum.data[1] += data[2] * rhs.data[9];
        accum.data[2] += data[2] * rhs.data[10];
        accum.data[3] += data[2] * rhs.data[11];

        // k=3
        accum.data[0] += data[3] * rhs.data[12];
        accum.data[1] += data[3] * rhs.data[13];
        accum.data[2] += data[3] * rhs.data[14];
        accum.data[3] += data[3] * rhs.data[15];

        return accum;
    }

    /// Matrix product of size 1-by-4-by-4
    CUTLASS_HOST_DEVICE
    Matrix<Element, 1, 4> operator*(Matrix<Element, 4, 4> const& rhs) const {
        return product(rhs);
    }

    /// Matrix product of size 1-by-4-by-4
    CUTLASS_HOST_DEVICE
    Matrix& operator*=(Matrix<Element, 4, 4> const& rhs) {
        *this = product(rhs);
        return *this;
    }

    /// Dot product of vectors with extent 4
    CUTLASS_HOST_DEVICE
    Element dot(Matrix<Element, 4, 1> const& rhs,
                Element accum = Element()) const {
        accum += data[0] * rhs.data[0];
        accum += data[1] * rhs.data[1];
        accum += data[2] * rhs.data[2];
        accum += data[3] * rhs.data[3];
        return accum;
    }

    /// Dot product of vectors with extent 4
    CUTLASS_HOST_DEVICE
    Element dot(Matrix<Element, 1, 4> const& rhs,
                Element accum = Element()) const {
        accum += data[0] * rhs.data[0];
        accum += data[1] * rhs.data[1];
        accum += data[2] * rhs.data[2];
        accum += data[3] * rhs.data[3];
        return accum;
    }

    /// Returns the sum of elements
    CUTLASS_HOST_DEVICE
    Element sum(Element accum = Element()) const {
        accum += data[0];
        accum += data[1];
        accum += data[2];
        accum += data[3];

        return accum;
    }

    /// Returns the sum of squared elements
    CUTLASS_HOST_DEVICE
    Element norm(Element accum = Element()) const {
        accum += data[0] * data[0];
        accum += data[1] * data[1];
        accum += data[2] * data[2];
        accum += data[3] * data[3];

        return accum;
    }

    /// Returns square root of the norm
    CUTLASS_HOST_DEVICE
    Element magnitude() const { return fast_sqrt(norm()); }

    /// Returns the sum of diagonal elements
    CUTLASS_HOST_DEVICE
    Element trace(Element accum = Element()) const {
        accum += data[0];

        return accum;
    }
};

/// Template alias for 1-by-4 matrix
template <typename Element>
using Matrix1x4 = Matrix<Element, 1, 4>;

/// Free funciton to infer element type from template arguments
template <typename Element>
CUTLASS_HOST_DEVICE Matrix1x4<Element> make_Matrix1x4(Element _0_0,
                                                      Element _0_1,
                                                      Element _0_2,
                                                      Element _0_3) {
    return Matrix1x4<Element>(_0_0, _0_1, _0_2, _0_3);
}

/////////////////////////////////////////////////////////////////////////////////////////////////

/// 2-by-1 matrix template class definition
template <typename Element_>
struct Matrix<Element_, 2, 1> {
    //
    // Type definitions
    //

    /// Element data type
    using Element = Element_;

    /// Number of rows in matrix
    static int const kRows = 2;

    /// Number of columns in matrix
    static int const kColumns = 1;

    /// Layout of matrix in underlying array
    using Layout = layout::RowMajor;

    /// Number of elements in matrix
    static int const kCount = 2;

    //
    // Data members
    //

    /// Elements of the matrix in row-major layout
    Array<Element, kCount> data;

    //
    // Methods
    //

    /// Constructs a zero matrix
    CUTLASS_HOST_DEVICE
    Matrix() { data.clear(); }

    /// Copy constructor for a 2-by-1 matrix
    CUTLASS_HOST_DEVICE
    Matrix(Matrix const& rhs) { data = rhs.data; }

    /// Constucts a 2-by-1 matrix from scalar elements
    CUTLASS_HOST_DEVICE
    Matrix(Element _0_0, Element _1_0) {
        data[0] = _0_0;
        data[1] = _1_0;
    }

    /// Constructs a matrix from a uniform element
    CUTLASS_HOST_DEVICE
    static Matrix uniform(Element s) {
        Matrix m;

        m.data[0] = s;
        m.data[1] = s;

        return m;
    }

    /// Constructs a matrix from a uniform element 1
    CUTLASS_HOST_DEVICE
    static Matrix ones() { return uniform(Element(1)); }

    /// Constructs a matrix from a uniform element 0
    CUTLASS_HOST_DEVICE
    static Matrix zero() { return Matrix(); }

    /// Returns a transposed matrix
    CUTLASS_HOST_DEVICE
    Matrix<Element, 1, 2> transpose() const {
        Matrix<Element, 1, 2> mt;

        mt.data[0] = data[0];
        mt.data[1] = data[1];

        return mt;
    }

    /// Accesses an element by coordinate
    CUTLASS_HOST_DEVICE
    Element at(int i, int j) const { return data[i * 2 + j]; }

    /// Accesses an element by coordinate
    CUTLASS_HOST_DEVICE
    Element& at(int i, int j) { return data[i * 2 + j]; }

    /// Accesses an element by coordinate
    CUTLASS_HOST_DEVICE
    Element at(Coord<2> const& coord) const { return at(coord[0], coord[1]); }

    /// Accesses an element by coordinate
    CUTLASS_HOST_DEVICE
    Element& at(Coord<2> const& coord) { return at(coord[0], coord[1]); }

    /// Accesses an element by offset
    CUTLASS_HOST_DEVICE
    Element& at(int offset) { return data[offset]; }

    /// Accesses an element by offset
    CUTLASS_HOST_DEVICE
    Element at(int offset) const { return data[offset]; }

    /// Accesses an element by coordinate
    CUTLASS_HOST_DEVICE
    Element operator[](Coord<2> const& coord) const {
        return at(coord[0], coord[1]);
    }

    /// Accesses an element by coordinate
    CUTLASS_HOST_DEVICE
    Element& operator[](Coord<2> const& coord) {
        return at(coord[0], coord[1]);
    }

    /// Accesses an element by offset
    CUTLASS_HOST_DEVICE
    Element& operator[](int offset) { return data[offset]; }

    /// Accesses an element by offset
    CUTLASS_HOST_DEVICE
    Element operator[](int offset) const { return data[offset]; }

    /// Gets a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix<Element, 2, 1> slice_2x1(int i = 0, int j = 0) const {
        Matrix<Element, 2, 1> m;

        m.data[0] = data[i * 1 + j + 0];
        m.data[1] = data[i * 1 + j + 1];

        return m;
    }

    /// Overwrites a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix& set_slice_2x1(Matrix<Element, 2, 1> const& m, int i = 0,
                          int j = 0) {
        data[i * 1 + j + 0] = m.data[0];
        data[i * 1 + j + 1] = m.data[1];

        return *this;
    }

    CUTLASS_HOST_DEVICE
    Matrix<Element, 2, 1> column(int j) const { return slice_2x1(0, j); }

    Matrix& set_column(Matrix<Element, 2, 1> const& v, int j = 0) {
        return set_slice_2x1(v, 0, j);
    }

    /// Concatenates this matrix with a a 2-by-1 matrix to form a 2-by-2 matrix
    CUTLASS_HOST_DEVICE
    Matrix<Element, 2, 2> hcat(Matrix<Element, 2, 1> const& rhs) const {
        return Matrix<Element, 2, 2>::hcat(*this, rhs);
    }

    /// Concatenates this matrix with a a 2-by-2 matrix to form a 2-by-3 matrix
    CUTLASS_HOST_DEVICE
    Matrix<Element, 2, 3> hcat(Matrix<Element, 2, 2> const& rhs) const {
        return Matrix<Element, 2, 3>::hcat(*this, rhs);
    }

    /// Concatenates this matrix with a a 2-by-3 matrix to form a 2-by-4 matrix
    CUTLASS_HOST_DEVICE
    Matrix<Element, 2, 4> hcat(Matrix<Element, 2, 3> const& rhs) const {
        return Matrix<Element, 2, 4>::hcat(*this, rhs);
    }

    /// Forms a 2-by-1 matrix by vertically concatenating an Element with an
    /// Element
    CUTLASS_HOST_DEVICE
    static Matrix vcat(Element upper, Element lower) {
        return Matrix(upper, lower);
    }

    /// Concatenates this matrix with a an Element to form a 3-by-1 matrix
    CUTLASS_HOST_DEVICE
    Matrix<Element, 3, 1> vcat(Element rhs) const {
        return Matrix<Element, 3, 1>::vcat(*this, rhs);
    }

    /// Concatenates this matrix with a a 2-by-1 matrix to form a 4-by-1 matrix
    CUTLASS_HOST_DEVICE
    Matrix<Element, 4, 1> vcat(Matrix<Element, 2, 1> const& rhs) const {
        return Matrix<Element, 4, 1>::vcat(*this, rhs);
    }

    /// Elementwise add operator (2-by-1)
    CUTLASS_HOST_DEVICE
    Matrix add(Matrix const& rhs) const {
        Matrix result;

        result.data[0] = data[0] + rhs.data[0];

        result.data[1] = data[1] + rhs.data[1];

        return result;
    }

    /// Elementwise add operator (2-by-1)
    CUTLASS_HOST_DEVICE
    Matrix operator+(Matrix const& rhs) const { return add(rhs); }

    /// Elementwise add operator (2-by-1)
    CUTLASS_HOST_DEVICE
    Matrix& operator+=(Matrix const& rhs) {
        data[0] += rhs.data[0];

        data[1] += rhs.data[1];

        return *this;
    }

    /// Elementwise subtract operator (2-by-1)
    CUTLASS_HOST_DEVICE
    Matrix subtract(Matrix const& rhs) const {
        Matrix result;

        result.data[0] = data[0] - rhs.data[0];

        result.data[1] = data[1] - rhs.data[1];

        return result;
    }

    /// Elementwise subtract operator (2-by-1)
    CUTLASS_HOST_DEVICE
    Matrix operator-(Matrix const& rhs) const { return subtract(rhs); }

    /// Elementwise subtract operator (2-by-1)
    CUTLASS_HOST_DEVICE
    Matrix& operator-=(Matrix const& rhs) {
        data[0] -= rhs.data[0];

        data[1] -= rhs.data[1];

        return *this;
    }

    /// Elementwise multiply operator (2-by-1)
    CUTLASS_HOST_DEVICE
    Matrix multiply(Matrix const& rhs) const {
        Matrix result;

        result.data[0] = data[0] * rhs.data[0];

        result.data[1] = data[1] * rhs.data[1];

        return result;
    }

    /// Scalar multiply operator (2-by-1)
    CUTLASS_HOST_DEVICE
    Matrix multiply(Element const& s) const {
        Matrix result;

        result.data[0] = data[0] * s;

        result.data[1] = data[1] * s;

        return result;
    }

    /// Scalar multiply operator (2-by-1)
    CUTLASS_HOST_DEVICE
    Matrix operator*(Element const& s) const { return multiply(s); }

    /// Scalar multiply operator (2-by-1)
    CUTLASS_HOST_DEVICE
    Matrix& operator*=(Element const& s) {
        data[0] *= s;

        data[1] *= s;

        return *this;
    }

    /// Elementwise divide operator (2-by-1)
    CUTLASS_HOST_DEVICE
    Matrix divide(Matrix const& rhs) const {
        Matrix result;

        result.data[0] = data[0] / rhs.data[0];

        result.data[1] = data[1] / rhs.data[1];

        return result;
    }

    /// Scalar divide operator (2-by-1)
    CUTLASS_HOST_DEVICE
    Matrix divide(Element const& s) const {
        Matrix result;

        result.data[0] = data[0] / s;

        result.data[1] = data[1] / s;

        return result;
    }

    /// Scalar divide operator (2-by-1)
    CUTLASS_HOST_DEVICE
    Matrix operator/(Element const& s) const { return divide(s); }

    /// Scalar divide operator (2-by-1)
    CUTLASS_HOST_DEVICE
    Matrix& operator/=(Element const& s) {
        data[0] /= s;

        data[1] /= s;

        return *this;
    }

    /// Elementwise divide operator (2-by-1)
    CUTLASS_HOST_DEVICE
    Matrix operator/(Matrix const& rhs) const { return divide(rhs); }

    /// Elementwise divide operator (2-by-1)
    CUTLASS_HOST_DEVICE
    Matrix& operator/=(Matrix const& rhs) {
        data[0] /= rhs.data[0];

        data[1] /= rhs.data[1];

        return *this;
    }

    /// Negates each element of the matrix
    CUTLASS_HOST_DEVICE
    Matrix operator-() const {
        Matrix m;

        m.data[0] = -m.data[0];
        m.data[1] = -m.data[1];

        return m;
    }

    /// Matrix product of size 2-by-1-by-1
    CUTLASS_HOST_DEVICE
    Matrix<Element, 2, 1> product(
            Matrix<Element, 1, 1> const& rhs,
            Matrix<Element, 2, 1> accum = Matrix<Element, 2, 1>()) const {
        // k=0
        accum.data[0] += data[0] * rhs.data[0];
        accum.data[1] += data[1] * rhs.data[0];

        return accum;
    }

    /// Matrix product of size 2-by-1-by-1
    CUTLASS_HOST_DEVICE
    Matrix<Element, 2, 1> operator*(Matrix<Element, 1, 1> const& rhs) const {
        return product(rhs);
    }

    /// Matrix product of size 2-by-1-by-1
    CUTLASS_HOST_DEVICE
    Matrix& operator*=(Matrix<Element, 1, 1> const& rhs) {
        *this = product(rhs);
        return *this;
    }

    /// Matrix product of size 2-by-2-by-1
    CUTLASS_HOST_DEVICE
    Matrix<Element, 2, 2> product(
            Matrix<Element, 1, 2> const& rhs,
            Matrix<Element, 2, 2> accum = Matrix<Element, 2, 2>()) const {
        // k=0
        accum.data[0] += data[0] * rhs.data[0];
        accum.data[1] += data[0] * rhs.data[1];
        accum.data[2] += data[1] * rhs.data[0];
        accum.data[3] += data[1] * rhs.data[1];

        return accum;
    }

    /// Matrix product of size 2-by-2-by-1
    CUTLASS_HOST_DEVICE
    Matrix<Element, 2, 2> operator*(Matrix<Element, 1, 2> const& rhs) const {
        return product(rhs);
    }

    /// Matrix product of size 2-by-3-by-1
    CUTLASS_HOST_DEVICE
    Matrix<Element, 2, 3> product(
            Matrix<Element, 1, 3> const& rhs,
            Matrix<Element, 2, 3> accum = Matrix<Element, 2, 3>()) const {
        // k=0
        accum.data[0] += data[0] * rhs.data[0];
        accum.data[1] += data[0] * rhs.data[1];
        accum.data[2] += data[0] * rhs.data[2];
        accum.data[3] += data[1] * rhs.data[0];
        accum.data[4] += data[1] * rhs.data[1];
        accum.data[5] += data[1] * rhs.data[2];

        return accum;
    }

    /// Matrix product of size 2-by-3-by-1
    CUTLASS_HOST_DEVICE
    Matrix<Element, 2, 3> operator*(Matrix<Element, 1, 3> const& rhs) const {
        return product(rhs);
    }

    /// Matrix product of size 2-by-4-by-1
    CUTLASS_HOST_DEVICE
    Matrix<Element, 2, 4> product(
            Matrix<Element, 1, 4> const& rhs,
            Matrix<Element, 2, 4> accum = Matrix<Element, 2, 4>()) const {
        // k=0
        accum.data[0] += data[0] * rhs.data[0];
        accum.data[1] += data[0] * rhs.data[1];
        accum.data[2] += data[0] * rhs.data[2];
        accum.data[3] += data[0] * rhs.data[3];
        accum.data[4] += data[1] * rhs.data[0];
        accum.data[5] += data[1] * rhs.data[1];
        accum.data[6] += data[1] * rhs.data[2];
        accum.data[7] += data[1] * rhs.data[3];

        return accum;
    }

    /// Matrix product of size 2-by-4-by-1
    CUTLASS_HOST_DEVICE
    Matrix<Element, 2, 4> operator*(Matrix<Element, 1, 4> const& rhs) const {
        return product(rhs);
    }

    /// Dot product of vectors with extent 2
    CUTLASS_HOST_DEVICE
    Element dot(Matrix<Element, 2, 1> const& rhs,
                Element accum = Element()) const {
        accum += data[0] * rhs.data[0];
        accum += data[1] * rhs.data[1];
        return accum;
    }

    /// Dot product of vectors with extent 2
    CUTLASS_HOST_DEVICE
    Element dot(Matrix<Element, 1, 2> const& rhs,
                Element accum = Element()) const {
        accum += data[0] * rhs.data[0];
        accum += data[1] * rhs.data[1];
        return accum;
    }

    /// Returns the sum of elements
    CUTLASS_HOST_DEVICE
    Element sum(Element accum = Element()) const {
        accum += data[0];
        accum += data[1];

        return accum;
    }

    /// Returns the sum of squared elements
    CUTLASS_HOST_DEVICE
    Element norm(Element accum = Element()) const {
        accum += data[0] * data[0];
        accum += data[1] * data[1];

        return accum;
    }

    /// Returns square root of the norm
    CUTLASS_HOST_DEVICE
    Element magnitude() const { return fast_sqrt(norm()); }

    /// Returns the sum of diagonal elements
    CUTLASS_HOST_DEVICE
    Element trace(Element accum = Element()) const {
        accum += data[0];

        return accum;
    }
};

/// Template alias for 2-by-1 matrix
template <typename Element>
using Matrix2x1 = Matrix<Element, 2, 1>;

/// Free funciton to infer element type from template arguments
template <typename Element>
CUTLASS_HOST_DEVICE Matrix2x1<Element> make_Matrix2x1(Element _0_0,
                                                      Element _1_0) {
    return Matrix2x1<Element>(_0_0, _1_0);
}

/////////////////////////////////////////////////////////////////////////////////////////////////

/// 2-by-2 matrix template class definition
template <typename Element_>
struct Matrix<Element_, 2, 2> {
    //
    // Type definitions
    //

    /// Element data type
    using Element = Element_;

    /// Number of rows in matrix
    static int const kRows = 2;

    /// Number of columns in matrix
    static int const kColumns = 2;

    /// Layout of matrix in underlying array
    using Layout = layout::RowMajor;

    /// Number of elements in matrix
    static int const kCount = 4;

    //
    // Data members
    //

    /// Elements of the matrix in row-major layout
    Array<Element, kCount> data;

    //
    // Methods
    //

    /// Constructs a zero matrix
    CUTLASS_HOST_DEVICE
    Matrix() { data.clear(); }

    /// Copy constructor for a 2-by-2 matrix
    CUTLASS_HOST_DEVICE
    Matrix(Matrix const& rhs) { data = rhs.data; }

    /// Constucts a 2-by-2 matrix from scalar elements
    CUTLASS_HOST_DEVICE
    Matrix(Element _0_0, Element _0_1, Element _1_0, Element _1_1) {
        data[0] = _0_0;
        data[1] = _0_1;
        data[2] = _1_0;
        data[3] = _1_1;
    }

    /// Constucts a 2-by-2 matrix from row vectors
    CUTLASS_HOST_DEVICE
    Matrix(Matrix<Element, 1, 2> const& row_0,
           Matrix<Element, 1, 2> const& row_1) {
        data[0] = row_0.data[0];
        data[1] = row_0.data[1];
        data[2] = row_1.data[0];
        data[3] = row_1.data[1];
    }

    /// Static method to construct a 2-by-2 matrix from column vectors
    CUTLASS_HOST_DEVICE
    static Matrix from_columns(Matrix<Element, 2, 1> const& column_0,
                               Matrix<Element, 2, 1> const& column_1) {
        Matrix result;

        result.data[0] = column_0.data[0];
        result.data[1] = column_1.data[0];
        result.data[2] = column_0.data[1];
        result.data[3] = column_1.data[1];
        return result;
    }

    /// Constructs an identity matrix
    CUTLASS_HOST_DEVICE
    static Matrix identity() {
        Matrix m;

        m.data[0] = Element(1);
        m.data[3] = Element(1);

        return m;
    }

    /// Constructs a matrix from a uniform element
    CUTLASS_HOST_DEVICE
    static Matrix uniform(Element s) {
        Matrix m;

        m.data[0] = s;
        m.data[1] = s;
        m.data[2] = s;
        m.data[3] = s;

        return m;
    }

    /// Constructs a matrix from a uniform element 1
    CUTLASS_HOST_DEVICE
    static Matrix ones() { return uniform(Element(1)); }

    /// Constructs a matrix from a uniform element 0
    CUTLASS_HOST_DEVICE
    static Matrix zero() { return Matrix(); }

    /// Constructs a matrix from elements along its diagonal
    CUTLASS_HOST_DEVICE
    static Matrix from_diagonal(Matrix<Element, 2, 1> const& diag) {
        Matrix m;

        m.data[0] = diag.data[0];
        m.data[3] = diag.data[1];

        return m;
    }

    /// Constructs a matrix from elements along its diagonal
    CUTLASS_HOST_DEVICE
    static Matrix from_diagonal(Matrix<Element, 1, 2> const& diag) {
        Matrix m;

        m.data[0] = diag.data[0];
        m.data[3] = diag.data[1];

        return m;
    }

    /// Gets an array of diagonal elements
    CUTLASS_HOST_DEVICE
    Matrix<Element, 2, 1> diagonal() const {
        Matrix<Element, 2, 1> diag;

        diag.data[0] = data[0];
        diag.data[1] = data[3];

        return diag;
    }

    /// Returns a transposed matrix
    CUTLASS_HOST_DEVICE
    Matrix<Element, 2, 2> transpose() const {
        Matrix<Element, 2, 2> mt;

        mt.data[0] = data[0];
        mt.data[2] = data[1];
        mt.data[1] = data[2];
        mt.data[3] = data[3];

        return mt;
    }

    /// Accesses an element by coordinate
    CUTLASS_HOST_DEVICE
    Element at(int i, int j) const { return data[i * 2 + j]; }

    /// Accesses an element by coordinate
    CUTLASS_HOST_DEVICE
    Element& at(int i, int j) { return data[i * 2 + j]; }

    /// Accesses an element by coordinate
    CUTLASS_HOST_DEVICE
    Element at(Coord<2> const& coord) const { return at(coord[0], coord[1]); }

    /// Accesses an element by coordinate
    CUTLASS_HOST_DEVICE
    Element& at(Coord<2> const& coord) { return at(coord[0], coord[1]); }

    /// Accesses an element by offset
    CUTLASS_HOST_DEVICE
    Element& at(int offset) { return data[offset]; }

    /// Accesses an element by offset
    CUTLASS_HOST_DEVICE
    Element at(int offset) const { return data[offset]; }

    /// Accesses an element by coordinate
    CUTLASS_HOST_DEVICE
    Element operator[](Coord<2> const& coord) const {
        return at(coord[0], coord[1]);
    }

    /// Accesses an element by coordinate
    CUTLASS_HOST_DEVICE
    Element& operator[](Coord<2> const& coord) {
        return at(coord[0], coord[1]);
    }

    /// Accesses an element by offset
    CUTLASS_HOST_DEVICE
    Element& operator[](int offset) { return data[offset]; }

    /// Accesses an element by offset
    CUTLASS_HOST_DEVICE
    Element operator[](int offset) const { return data[offset]; }

    /// Gets a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix<Element, 1, 2> slice_1x2(int i = 0, int j = 0) const {
        Matrix<Element, 1, 2> m;

        m.data[0] = data[i * 2 + j + 0];
        m.data[1] = data[i * 2 + j + 1];

        return m;
    }

    /// Overwrites a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix& set_slice_1x2(Matrix<Element, 1, 2> const& m, int i = 0,
                          int j = 0) {
        data[i * 2 + j + 0] = m.data[0];
        data[i * 2 + j + 1] = m.data[1];

        return *this;
    }

    CUTLASS_HOST_DEVICE
    Matrix<Element, 1, 2> row(int i) const { return slice_1x2(i, 0); }

    Matrix& set_row(Matrix<Element, 1, 2> const& v, int i = 0) {
        return set_slice_1x2(v, i, 0);
    }

    /// Gets a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix<Element, 2, 1> slice_2x1(int i = 0, int j = 0) const {
        Matrix<Element, 2, 1> m;

        m.data[0] = data[i * 2 + j + 0];
        m.data[1] = data[i * 2 + j + 2];

        return m;
    }

    /// Overwrites a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix& set_slice_2x1(Matrix<Element, 2, 1> const& m, int i = 0,
                          int j = 0) {
        data[i * 2 + j + 0] = m.data[0];
        data[i * 2 + j + 2] = m.data[1];

        return *this;
    }

    CUTLASS_HOST_DEVICE
    Matrix<Element, 2, 1> column(int j) const { return slice_2x1(0, j); }

    Matrix& set_column(Matrix<Element, 2, 1> const& v, int j = 0) {
        return set_slice_2x1(v, 0, j);
    }

    /// Gets a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix<Element, 2, 2> slice_2x2(int i = 0, int j = 0) const {
        Matrix<Element, 2, 2> m;

        m.data[0] = data[i * 2 + j + 0];
        m.data[1] = data[i * 2 + j + 1];
        m.data[2] = data[i * 2 + j + 2];
        m.data[3] = data[i * 2 + j + 3];

        return m;
    }

    /// Overwrites a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix& set_slice_2x2(Matrix<Element, 2, 2> const& m, int i = 0,
                          int j = 0) {
        data[i * 2 + j + 0] = m.data[0];
        data[i * 2 + j + 1] = m.data[1];
        data[i * 2 + j + 2] = m.data[2];
        data[i * 2 + j + 3] = m.data[3];

        return *this;
    }

    /// Forms a 2-by-2 matrix by horizontally concatenating a 2-by-1 matrix with
    /// a 2-by-1 matrix
    CUTLASS_HOST_DEVICE
    static Matrix hcat(Matrix<Element, 2, 1> const& lhs,
                       Matrix<Element, 2, 1> const& rhs) {
        return Matrix(lhs.at(0, 0), rhs.at(0, 0), lhs.at(1, 0), rhs.at(1, 0));
    }

    /// Concatenates this matrix with a a 2-by-1 matrix to form a 2-by-3 matrix
    CUTLASS_HOST_DEVICE
    Matrix<Element, 2, 3> hcat(Matrix<Element, 2, 1> const& rhs) const {
        return Matrix<Element, 2, 3>::hcat(*this, rhs);
    }

    /// Concatenates this matrix with a a 2-by-2 matrix to form a 2-by-4 matrix
    CUTLASS_HOST_DEVICE
    Matrix<Element, 2, 4> hcat(Matrix<Element, 2, 2> const& rhs) const {
        return Matrix<Element, 2, 4>::hcat(*this, rhs);
    }

    /// Forms a 2-by-2 matrix by vertically concatenating a 1-by-2 matrix with a
    /// 1-by-2 matrix
    CUTLASS_HOST_DEVICE
    static Matrix vcat(Matrix<Element, 1, 2> const& upper,
                       Matrix<Element, 1, 2> const& lower) {
        return Matrix(upper.at(0, 0), upper.at(0, 1), lower.at(0, 0),
                      lower.at(0, 1));
    }

    /// Concatenates this matrix with a a 1-by-2 matrix to form a 3-by-2 matrix
    CUTLASS_HOST_DEVICE
    Matrix<Element, 3, 2> vcat(Matrix<Element, 1, 2> const& rhs) const {
        return Matrix<Element, 3, 2>::vcat(*this, rhs);
    }

    /// Concatenates this matrix with a a 2-by-2 matrix to form a 4-by-2 matrix
    CUTLASS_HOST_DEVICE
    Matrix<Element, 4, 2> vcat(Matrix<Element, 2, 2> const& rhs) const {
        return Matrix<Element, 4, 2>::vcat(*this, rhs);
    }

    /// Forms a 2-by-2 matrix by concatenating four components
    CUTLASS_HOST_DEVICE
    static Matrix block(Element A, Element B, Element C, Element D) {
        return Matrix(A, B, C, D);
    }

    /// Elementwise add operator (2-by-2)
    CUTLASS_HOST_DEVICE
    Matrix add(Matrix const& rhs) const {
        Matrix result;

        result.data[0] = data[0] + rhs.data[0];
        result.data[1] = data[1] + rhs.data[1];

        result.data[2] = data[2] + rhs.data[2];
        result.data[3] = data[3] + rhs.data[3];

        return result;
    }

    /// Elementwise add operator (2-by-2)
    CUTLASS_HOST_DEVICE
    Matrix operator+(Matrix const& rhs) const { return add(rhs); }

    /// Elementwise add operator (2-by-2)
    CUTLASS_HOST_DEVICE
    Matrix& operator+=(Matrix const& rhs) {
        data[0] += rhs.data[0];
        data[1] += rhs.data[1];

        data[2] += rhs.data[2];
        data[3] += rhs.data[3];

        return *this;
    }

    /// Elementwise subtract operator (2-by-2)
    CUTLASS_HOST_DEVICE
    Matrix subtract(Matrix const& rhs) const {
        Matrix result;

        result.data[0] = data[0] - rhs.data[0];
        result.data[1] = data[1] - rhs.data[1];

        result.data[2] = data[2] - rhs.data[2];
        result.data[3] = data[3] - rhs.data[3];

        return result;
    }

    /// Elementwise subtract operator (2-by-2)
    CUTLASS_HOST_DEVICE
    Matrix operator-(Matrix const& rhs) const { return subtract(rhs); }

    /// Elementwise subtract operator (2-by-2)
    CUTLASS_HOST_DEVICE
    Matrix& operator-=(Matrix const& rhs) {
        data[0] -= rhs.data[0];
        data[1] -= rhs.data[1];

        data[2] -= rhs.data[2];
        data[3] -= rhs.data[3];

        return *this;
    }

    /// Elementwise multiply operator (2-by-2)
    CUTLASS_HOST_DEVICE
    Matrix multiply(Matrix const& rhs) const {
        Matrix result;

        result.data[0] = data[0] * rhs.data[0];
        result.data[1] = data[1] * rhs.data[1];

        result.data[2] = data[2] * rhs.data[2];
        result.data[3] = data[3] * rhs.data[3];

        return result;
    }

    /// Scalar multiply operator (2-by-2)
    CUTLASS_HOST_DEVICE
    Matrix multiply(Element const& s) const {
        Matrix result;

        result.data[0] = data[0] * s;
        result.data[1] = data[1] * s;

        result.data[2] = data[2] * s;
        result.data[3] = data[3] * s;

        return result;
    }

    /// Scalar multiply operator (2-by-2)
    CUTLASS_HOST_DEVICE
    Matrix operator*(Element const& s) const { return multiply(s); }

    /// Scalar multiply operator (2-by-2)
    CUTLASS_HOST_DEVICE
    Matrix& operator*=(Element const& s) {
        data[0] *= s;
        data[1] *= s;

        data[2] *= s;
        data[3] *= s;

        return *this;
    }

    /// Elementwise divide operator (2-by-2)
    CUTLASS_HOST_DEVICE
    Matrix divide(Matrix const& rhs) const {
        Matrix result;

        result.data[0] = data[0] / rhs.data[0];
        result.data[1] = data[1] / rhs.data[1];

        result.data[2] = data[2] / rhs.data[2];
        result.data[3] = data[3] / rhs.data[3];

        return result;
    }

    /// Scalar divide operator (2-by-2)
    CUTLASS_HOST_DEVICE
    Matrix divide(Element const& s) const {
        Matrix result;

        result.data[0] = data[0] / s;
        result.data[1] = data[1] / s;

        result.data[2] = data[2] / s;
        result.data[3] = data[3] / s;

        return result;
    }

    /// Scalar divide operator (2-by-2)
    CUTLASS_HOST_DEVICE
    Matrix operator/(Element const& s) const { return divide(s); }

    /// Scalar divide operator (2-by-2)
    CUTLASS_HOST_DEVICE
    Matrix& operator/=(Element const& s) {
        data[0] /= s;
        data[1] /= s;

        data[2] /= s;
        data[3] /= s;

        return *this;
    }

    /// Elementwise divide operator (2-by-2)
    CUTLASS_HOST_DEVICE
    Matrix operator/(Matrix const& rhs) const { return divide(rhs); }

    /// Elementwise divide operator (2-by-2)
    CUTLASS_HOST_DEVICE
    Matrix& operator/=(Matrix const& rhs) {
        data[0] /= rhs.data[0];
        data[1] /= rhs.data[1];

        data[2] /= rhs.data[2];
        data[3] /= rhs.data[3];

        return *this;
    }

    /// Negates each element of the matrix
    CUTLASS_HOST_DEVICE
    Matrix operator-() const {
        Matrix m;

        m.data[0] = -m.data[0];
        m.data[1] = -m.data[1];
        m.data[2] = -m.data[2];
        m.data[3] = -m.data[3];

        return m;
    }

    /// Matrix product of size 2-by-1-by-2
    CUTLASS_HOST_DEVICE
    Matrix<Element, 2, 1> product(
            Matrix<Element, 2, 1> const& rhs,
            Matrix<Element, 2, 1> accum = Matrix<Element, 2, 1>()) const {
        // k=0
        accum.data[0] += data[0] * rhs.data[0];
        accum.data[1] += data[2] * rhs.data[0];

        // k=1
        accum.data[0] += data[1] * rhs.data[1];
        accum.data[1] += data[3] * rhs.data[1];

        return accum;
    }

    /// Matrix product of size 2-by-1-by-2
    CUTLASS_HOST_DEVICE
    Matrix<Element, 2, 1> operator*(Matrix<Element, 2, 1> const& rhs) const {
        return product(rhs);
    }

    /// Matrix product of size 2-by-2-by-2
    CUTLASS_HOST_DEVICE
    Matrix<Element, 2, 2> product(
            Matrix<Element, 2, 2> const& rhs,
            Matrix<Element, 2, 2> accum = Matrix<Element, 2, 2>()) const {
        // k=0
        accum.data[0] += data[0] * rhs.data[0];
        accum.data[1] += data[0] * rhs.data[1];
        accum.data[2] += data[2] * rhs.data[0];
        accum.data[3] += data[2] * rhs.data[1];

        // k=1
        accum.data[0] += data[1] * rhs.data[2];
        accum.data[1] += data[1] * rhs.data[3];
        accum.data[2] += data[3] * rhs.data[2];
        accum.data[3] += data[3] * rhs.data[3];

        return accum;
    }

    /// Matrix product of size 2-by-2-by-2
    CUTLASS_HOST_DEVICE
    Matrix<Element, 2, 2> operator*(Matrix<Element, 2, 2> const& rhs) const {
        return product(rhs);
    }

    /// Matrix product of size 2-by-2-by-2
    CUTLASS_HOST_DEVICE
    Matrix& operator*=(Matrix<Element, 2, 2> const& rhs) {
        *this = product(rhs);
        return *this;
    }

    /// Matrix product of size 2-by-3-by-2
    CUTLASS_HOST_DEVICE
    Matrix<Element, 2, 3> product(
            Matrix<Element, 2, 3> const& rhs,
            Matrix<Element, 2, 3> accum = Matrix<Element, 2, 3>()) const {
        // k=0
        accum.data[0] += data[0] * rhs.data[0];
        accum.data[1] += data[0] * rhs.data[1];
        accum.data[2] += data[0] * rhs.data[2];
        accum.data[3] += data[2] * rhs.data[0];
        accum.data[4] += data[2] * rhs.data[1];
        accum.data[5] += data[2] * rhs.data[2];

        // k=1
        accum.data[0] += data[1] * rhs.data[3];
        accum.data[1] += data[1] * rhs.data[4];
        accum.data[2] += data[1] * rhs.data[5];
        accum.data[3] += data[3] * rhs.data[3];
        accum.data[4] += data[3] * rhs.data[4];
        accum.data[5] += data[3] * rhs.data[5];

        return accum;
    }

    /// Matrix product of size 2-by-3-by-2
    CUTLASS_HOST_DEVICE
    Matrix<Element, 2, 3> operator*(Matrix<Element, 2, 3> const& rhs) const {
        return product(rhs);
    }

    /// Matrix product of size 2-by-4-by-2
    CUTLASS_HOST_DEVICE
    Matrix<Element, 2, 4> product(
            Matrix<Element, 2, 4> const& rhs,
            Matrix<Element, 2, 4> accum = Matrix<Element, 2, 4>()) const {
        // k=0
        accum.data[0] += data[0] * rhs.data[0];
        accum.data[1] += data[0] * rhs.data[1];
        accum.data[2] += data[0] * rhs.data[2];
        accum.data[3] += data[0] * rhs.data[3];
        accum.data[4] += data[2] * rhs.data[0];
        accum.data[5] += data[2] * rhs.data[1];
        accum.data[6] += data[2] * rhs.data[2];
        accum.data[7] += data[2] * rhs.data[3];

        // k=1
        accum.data[0] += data[1] * rhs.data[4];
        accum.data[1] += data[1] * rhs.data[5];
        accum.data[2] += data[1] * rhs.data[6];
        accum.data[3] += data[1] * rhs.data[7];
        accum.data[4] += data[3] * rhs.data[4];
        accum.data[5] += data[3] * rhs.data[5];
        accum.data[6] += data[3] * rhs.data[6];
        accum.data[7] += data[3] * rhs.data[7];

        return accum;
    }

    /// Matrix product of size 2-by-4-by-2
    CUTLASS_HOST_DEVICE
    Matrix<Element, 2, 4> operator*(Matrix<Element, 2, 4> const& rhs) const {
        return product(rhs);
    }

    /// Returns the sum of elements
    CUTLASS_HOST_DEVICE
    Element sum(Element accum = Element()) const {
        accum += data[0];
        accum += data[1];
        accum += data[2];
        accum += data[3];

        return accum;
    }

    /// Returns the sum of squared elements
    CUTLASS_HOST_DEVICE
    Element norm(Element accum = Element()) const {
        accum += data[0] * data[0];
        accum += data[1] * data[1];
        accum += data[2] * data[2];
        accum += data[3] * data[3];

        return accum;
    }

    /// Returns square root of the norm
    CUTLASS_HOST_DEVICE
    Element magnitude() const { return fast_sqrt(norm()); }

    /// Returns the sum of diagonal elements
    CUTLASS_HOST_DEVICE
    Element trace(Element accum = Element()) const {
        accum += data[0];
        accum += data[3];

        return accum;
    }

    /// Returns 2-by-2 rotation matrix
    CUTLASS_HOST_DEVICE
    static Matrix rotation(Element theta) {
        Element c = fast_cos(theta);
        Element s = fast_sin(theta);

        return Matrix(c, -s, s, c);
    }

    /// Computes the determinant of a 2-by-2 matrix
    CUTLASS_HOST_DEVICE
    Element determinant(Element accum = Element()) const {
        accum += data[0] * data[3] - data[1] * data[2];

        return accum;
    }

    /// Computes the inverse of a 2-by-2 matrix given
    /// the matrix's determinant
    CUTLASS_HOST_DEVICE
    Matrix inverse(Element det) const {
        return Matrix(data[3], -data[1], -data[2], data[0]) *
               (Element(1) / det);
    }

    /// Computes the inverse of a 2-by-2 matrix.
    CUTLASS_HOST_DEVICE
    Matrix inverse() const { return inverse(determinant()); }
};

/// Template alias for 2-by-2 matrix
template <typename Element>
using Matrix2x2 = Matrix<Element, 2, 2>;

/// Free funciton to infer element type from template arguments
template <typename Element>
CUTLASS_HOST_DEVICE Matrix2x2<Element> make_Matrix2x2(Element _0_0,
                                                      Element _0_1,
                                                      Element _1_0,
                                                      Element _1_1) {
    return Matrix2x2<Element>(_0_0, _0_1, _1_0, _1_1);
}

/////////////////////////////////////////////////////////////////////////////////////////////////

/// 2-by-3 matrix template class definition
template <typename Element_>
struct Matrix<Element_, 2, 3> {
    //
    // Type definitions
    //

    /// Element data type
    using Element = Element_;

    /// Number of rows in matrix
    static int const kRows = 2;

    /// Number of columns in matrix
    static int const kColumns = 3;

    /// Layout of matrix in underlying array
    using Layout = layout::RowMajor;

    /// Number of elements in matrix
    static int const kCount = 6;

    //
    // Data members
    //

    /// Elements of the matrix in row-major layout
    Array<Element, kCount> data;

    //
    // Methods
    //

    /// Constructs a zero matrix
    CUTLASS_HOST_DEVICE
    Matrix() { data.clear(); }

    /// Copy constructor for a 2-by-3 matrix
    CUTLASS_HOST_DEVICE
    Matrix(Matrix const& rhs) { data = rhs.data; }

    /// Constucts a 2-by-3 matrix from scalar elements
    CUTLASS_HOST_DEVICE
    Matrix(Element _0_0, Element _0_1, Element _0_2, Element _1_0, Element _1_1,
           Element _1_2) {
        data[0] = _0_0;
        data[1] = _0_1;
        data[2] = _0_2;
        data[3] = _1_0;
        data[4] = _1_1;
        data[5] = _1_2;
    }

    /// Constucts a 2-by-3 matrix from row vectors
    CUTLASS_HOST_DEVICE
    Matrix(Matrix<Element, 1, 3> const& row_0,
           Matrix<Element, 1, 3> const& row_1) {
        data[0] = row_0.data[0];
        data[1] = row_0.data[1];
        data[2] = row_0.data[2];
        data[3] = row_1.data[0];
        data[4] = row_1.data[1];
        data[5] = row_1.data[2];
    }

    /// Static method to construct a 2-by-3 matrix from column vectors
    CUTLASS_HOST_DEVICE
    static Matrix from_columns(Matrix<Element, 3, 1> const& column_0,
                               Matrix<Element, 3, 1> const& column_1,
                               Matrix<Element, 3, 1> const& column_2) {
        Matrix result;

        result.data[0] = column_0.data[0];
        result.data[1] = column_1.data[0];
        result.data[2] = column_2.data[0];
        result.data[3] = column_0.data[1];
        result.data[4] = column_1.data[1];
        result.data[5] = column_2.data[1];
        return result;
    }

    /// Constructs a matrix from a uniform element
    CUTLASS_HOST_DEVICE
    static Matrix uniform(Element s) {
        Matrix m;

        m.data[0] = s;
        m.data[1] = s;
        m.data[2] = s;
        m.data[3] = s;
        m.data[4] = s;
        m.data[5] = s;

        return m;
    }

    /// Constructs a matrix from a uniform element 1
    CUTLASS_HOST_DEVICE
    static Matrix ones() { return uniform(Element(1)); }

    /// Constructs a matrix from a uniform element 0
    CUTLASS_HOST_DEVICE
    static Matrix zero() { return Matrix(); }

    /// Constructs a matrix from elements along its diagonal
    CUTLASS_HOST_DEVICE
    static Matrix from_diagonal(Matrix<Element, 2, 1> const& diag) {
        Matrix m;

        m.data[0] = diag.data[0];
        m.data[3] = diag.data[1];

        return m;
    }

    /// Constructs a matrix from elements along its diagonal
    CUTLASS_HOST_DEVICE
    static Matrix from_diagonal(Matrix<Element, 1, 2> const& diag) {
        Matrix m;

        m.data[0] = diag.data[0];
        m.data[3] = diag.data[1];

        return m;
    }

    /// Gets an array of diagonal elements
    CUTLASS_HOST_DEVICE
    Matrix<Element, 2, 1> diagonal() const {
        Matrix<Element, 2, 1> diag;

        diag.data[0] = data[0];
        diag.data[1] = data[3];

        return diag;
    }

    /// Returns a transposed matrix
    CUTLASS_HOST_DEVICE
    Matrix<Element, 3, 2> transpose() const {
        Matrix<Element, 3, 2> mt;

        mt.data[0] = data[0];
        mt.data[2] = data[1];
        mt.data[4] = data[2];
        mt.data[1] = data[3];
        mt.data[3] = data[4];
        mt.data[5] = data[5];

        return mt;
    }

    /// Accesses an element by coordinate
    CUTLASS_HOST_DEVICE
    Element at(int i, int j) const { return data[i * 2 + j]; }

    /// Accesses an element by coordinate
    CUTLASS_HOST_DEVICE
    Element& at(int i, int j) { return data[i * 2 + j]; }

    /// Accesses an element by coordinate
    CUTLASS_HOST_DEVICE
    Element at(Coord<2> const& coord) const { return at(coord[0], coord[1]); }

    /// Accesses an element by coordinate
    CUTLASS_HOST_DEVICE
    Element& at(Coord<2> const& coord) { return at(coord[0], coord[1]); }

    /// Accesses an element by offset
    CUTLASS_HOST_DEVICE
    Element& at(int offset) { return data[offset]; }

    /// Accesses an element by offset
    CUTLASS_HOST_DEVICE
    Element at(int offset) const { return data[offset]; }

    /// Accesses an element by coordinate
    CUTLASS_HOST_DEVICE
    Element operator[](Coord<2> const& coord) const {
        return at(coord[0], coord[1]);
    }

    /// Accesses an element by coordinate
    CUTLASS_HOST_DEVICE
    Element& operator[](Coord<2> const& coord) {
        return at(coord[0], coord[1]);
    }

    /// Accesses an element by offset
    CUTLASS_HOST_DEVICE
    Element& operator[](int offset) { return data[offset]; }

    /// Accesses an element by offset
    CUTLASS_HOST_DEVICE
    Element operator[](int offset) const { return data[offset]; }

    /// Gets a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix<Element, 1, 2> slice_1x2(int i = 0, int j = 0) const {
        Matrix<Element, 1, 2> m;

        m.data[0] = data[i * 3 + j + 0];
        m.data[1] = data[i * 3 + j + 1];

        return m;
    }

    /// Overwrites a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix& set_slice_1x2(Matrix<Element, 1, 2> const& m, int i = 0,
                          int j = 0) {
        data[i * 3 + j + 0] = m.data[0];
        data[i * 3 + j + 1] = m.data[1];

        return *this;
    }

    /// Gets a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix<Element, 1, 3> slice_1x3(int i = 0, int j = 0) const {
        Matrix<Element, 1, 3> m;

        m.data[0] = data[i * 3 + j + 0];
        m.data[1] = data[i * 3 + j + 1];
        m.data[2] = data[i * 3 + j + 2];

        return m;
    }

    /// Overwrites a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix& set_slice_1x3(Matrix<Element, 1, 3> const& m, int i = 0,
                          int j = 0) {
        data[i * 3 + j + 0] = m.data[0];
        data[i * 3 + j + 1] = m.data[1];
        data[i * 3 + j + 2] = m.data[2];

        return *this;
    }

    CUTLASS_HOST_DEVICE
    Matrix<Element, 1, 3> row(int i) const { return slice_1x3(i, 0); }

    Matrix& set_row(Matrix<Element, 1, 3> const& v, int i = 0) {
        return set_slice_1x3(v, i, 0);
    }

    /// Gets a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix<Element, 2, 1> slice_2x1(int i = 0, int j = 0) const {
        Matrix<Element, 2, 1> m;

        m.data[0] = data[i * 3 + j + 0];
        m.data[1] = data[i * 3 + j + 3];

        return m;
    }

    /// Overwrites a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix& set_slice_2x1(Matrix<Element, 2, 1> const& m, int i = 0,
                          int j = 0) {
        data[i * 3 + j + 0] = m.data[0];
        data[i * 3 + j + 3] = m.data[1];

        return *this;
    }

    CUTLASS_HOST_DEVICE
    Matrix<Element, 2, 1> column(int j) const { return slice_2x1(0, j); }

    Matrix& set_column(Matrix<Element, 2, 1> const& v, int j = 0) {
        return set_slice_2x1(v, 0, j);
    }

    /// Gets a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix<Element, 2, 2> slice_2x2(int i = 0, int j = 0) const {
        Matrix<Element, 2, 2> m;

        m.data[0] = data[i * 3 + j + 0];
        m.data[1] = data[i * 3 + j + 1];
        m.data[2] = data[i * 3 + j + 3];
        m.data[3] = data[i * 3 + j + 4];

        return m;
    }

    /// Overwrites a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix& set_slice_2x2(Matrix<Element, 2, 2> const& m, int i = 0,
                          int j = 0) {
        data[i * 3 + j + 0] = m.data[0];
        data[i * 3 + j + 1] = m.data[1];
        data[i * 3 + j + 3] = m.data[2];
        data[i * 3 + j + 4] = m.data[3];

        return *this;
    }

    /// Gets a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix<Element, 2, 3> slice_2x3(int i = 0, int j = 0) const {
        Matrix<Element, 2, 3> m;

        m.data[0] = data[i * 3 + j + 0];
        m.data[1] = data[i * 3 + j + 1];
        m.data[2] = data[i * 3 + j + 2];
        m.data[3] = data[i * 3 + j + 3];
        m.data[4] = data[i * 3 + j + 4];
        m.data[5] = data[i * 3 + j + 5];

        return m;
    }

    /// Overwrites a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix& set_slice_2x3(Matrix<Element, 2, 3> const& m, int i = 0,
                          int j = 0) {
        data[i * 3 + j + 0] = m.data[0];
        data[i * 3 + j + 1] = m.data[1];
        data[i * 3 + j + 2] = m.data[2];
        data[i * 3 + j + 3] = m.data[3];
        data[i * 3 + j + 4] = m.data[4];
        data[i * 3 + j + 5] = m.data[5];

        return *this;
    }

    /// Forms a 2-by-3 matrix by horizontally concatenating a 2-by-1 matrix with
    /// a 2-by-2 matrix
    CUTLASS_HOST_DEVICE
    static Matrix hcat(Matrix<Element, 2, 1> const& lhs,
                       Matrix<Element, 2, 2> const& rhs) {
        return Matrix(lhs.at(0, 0), rhs.at(0, 0), rhs.at(0, 1), lhs.at(1, 0),
                      rhs.at(1, 0), rhs.at(1, 1));
    }

    /// Forms a 2-by-3 matrix by horizontally concatenating a 2-by-2 matrix with
    /// a 2-by-1 matrix
    CUTLASS_HOST_DEVICE
    static Matrix hcat(Matrix<Element, 2, 2> const& lhs,
                       Matrix<Element, 2, 1> const& rhs) {
        return Matrix(lhs.at(0, 0), lhs.at(0, 1), rhs.at(0, 0), lhs.at(1, 0),
                      lhs.at(1, 1), rhs.at(1, 0));
    }

    /// Concatenates this matrix with a a 2-by-1 matrix to form a 2-by-4 matrix
    CUTLASS_HOST_DEVICE
    Matrix<Element, 2, 4> hcat(Matrix<Element, 2, 1> const& rhs) const {
        return Matrix<Element, 2, 4>::hcat(*this, rhs);
    }

    /// Forms a 2-by-3 matrix by vertically concatenating a 1-by-3 matrix with a
    /// 1-by-3 matrix
    CUTLASS_HOST_DEVICE
    static Matrix vcat(Matrix<Element, 1, 3> const& upper,
                       Matrix<Element, 1, 3> const& lower) {
        return Matrix(upper.at(0, 0), upper.at(0, 1), upper.at(0, 2),
                      lower.at(0, 0), lower.at(0, 1), lower.at(0, 2));
    }

    /// Concatenates this matrix with a a 1-by-3 matrix to form a 3-by-3 matrix
    CUTLASS_HOST_DEVICE
    Matrix<Element, 3, 3> vcat(Matrix<Element, 1, 3> const& rhs) const {
        return Matrix<Element, 3, 3>::vcat(*this, rhs);
    }

    /// Concatenates this matrix with a a 2-by-3 matrix to form a 4-by-3 matrix
    CUTLASS_HOST_DEVICE
    Matrix<Element, 4, 3> vcat(Matrix<Element, 2, 3> const& rhs) const {
        return Matrix<Element, 4, 3>::vcat(*this, rhs);
    }

    /// Forms a 2-by-3 matrix by concatenating four components
    CUTLASS_HOST_DEVICE
    static Matrix block(Element A, Matrix<Element, 1, 2> const& B, Element C,
                        Matrix<Element, 1, 2> const& D) {
        return Matrix(A, B.at(0, 0), B.at(0, 1), C, D.at(0, 0), D.at(0, 1));
    }

    /// Forms a 2-by-3 matrix by concatenating four components
    CUTLASS_HOST_DEVICE
    static Matrix block(Matrix<Element, 1, 2> const& A, Element B,
                        Matrix<Element, 1, 2> const& C, Element D) {
        return Matrix(A.at(0, 0), A.at(0, 1), B, C.at(0, 0), C.at(0, 1), D);
    }

    /// Elementwise add operator (2-by-3)
    CUTLASS_HOST_DEVICE
    Matrix add(Matrix const& rhs) const {
        Matrix result;

        result.data[0] = data[0] + rhs.data[0];
        result.data[1] = data[1] + rhs.data[1];
        result.data[2] = data[2] + rhs.data[2];

        result.data[3] = data[3] + rhs.data[3];
        result.data[4] = data[4] + rhs.data[4];
        result.data[5] = data[5] + rhs.data[5];

        return result;
    }

    /// Elementwise add operator (2-by-3)
    CUTLASS_HOST_DEVICE
    Matrix operator+(Matrix const& rhs) const { return add(rhs); }

    /// Elementwise add operator (2-by-3)
    CUTLASS_HOST_DEVICE
    Matrix& operator+=(Matrix const& rhs) {
        data[0] += rhs.data[0];
        data[1] += rhs.data[1];
        data[2] += rhs.data[2];

        data[3] += rhs.data[3];
        data[4] += rhs.data[4];
        data[5] += rhs.data[5];

        return *this;
    }

    /// Elementwise subtract operator (2-by-3)
    CUTLASS_HOST_DEVICE
    Matrix subtract(Matrix const& rhs) const {
        Matrix result;

        result.data[0] = data[0] - rhs.data[0];
        result.data[1] = data[1] - rhs.data[1];
        result.data[2] = data[2] - rhs.data[2];

        result.data[3] = data[3] - rhs.data[3];
        result.data[4] = data[4] - rhs.data[4];
        result.data[5] = data[5] - rhs.data[5];

        return result;
    }

    /// Elementwise subtract operator (2-by-3)
    CUTLASS_HOST_DEVICE
    Matrix operator-(Matrix const& rhs) const { return subtract(rhs); }

    /// Elementwise subtract operator (2-by-3)
    CUTLASS_HOST_DEVICE
    Matrix& operator-=(Matrix const& rhs) {
        data[0] -= rhs.data[0];
        data[1] -= rhs.data[1];
        data[2] -= rhs.data[2];

        data[3] -= rhs.data[3];
        data[4] -= rhs.data[4];
        data[5] -= rhs.data[5];

        return *this;
    }

    /// Elementwise multiply operator (2-by-3)
    CUTLASS_HOST_DEVICE
    Matrix multiply(Matrix const& rhs) const {
        Matrix result;

        result.data[0] = data[0] * rhs.data[0];
        result.data[1] = data[1] * rhs.data[1];
        result.data[2] = data[2] * rhs.data[2];

        result.data[3] = data[3] * rhs.data[3];
        result.data[4] = data[4] * rhs.data[4];
        result.data[5] = data[5] * rhs.data[5];

        return result;
    }

    /// Scalar multiply operator (2-by-3)
    CUTLASS_HOST_DEVICE
    Matrix multiply(Element const& s) const {
        Matrix result;

        result.data[0] = data[0] * s;
        result.data[1] = data[1] * s;
        result.data[2] = data[2] * s;

        result.data[3] = data[3] * s;
        result.data[4] = data[4] * s;
        result.data[5] = data[5] * s;

        return result;
    }

    /// Scalar multiply operator (2-by-3)
    CUTLASS_HOST_DEVICE
    Matrix operator*(Element const& s) const { return multiply(s); }

    /// Scalar multiply operator (2-by-3)
    CUTLASS_HOST_DEVICE
    Matrix& operator*=(Element const& s) {
        data[0] *= s;
        data[1] *= s;
        data[2] *= s;

        data[3] *= s;
        data[4] *= s;
        data[5] *= s;

        return *this;
    }

    /// Elementwise divide operator (2-by-3)
    CUTLASS_HOST_DEVICE
    Matrix divide(Matrix const& rhs) const {
        Matrix result;

        result.data[0] = data[0] / rhs.data[0];
        result.data[1] = data[1] / rhs.data[1];
        result.data[2] = data[2] / rhs.data[2];

        result.data[3] = data[3] / rhs.data[3];
        result.data[4] = data[4] / rhs.data[4];
        result.data[5] = data[5] / rhs.data[5];

        return result;
    }

    /// Scalar divide operator (2-by-3)
    CUTLASS_HOST_DEVICE
    Matrix divide(Element const& s) const {
        Matrix result;

        result.data[0] = data[0] / s;
        result.data[1] = data[1] / s;
        result.data[2] = data[2] / s;

        result.data[3] = data[3] / s;
        result.data[4] = data[4] / s;
        result.data[5] = data[5] / s;

        return result;
    }

    /// Scalar divide operator (2-by-3)
    CUTLASS_HOST_DEVICE
    Matrix operator/(Element const& s) const { return divide(s); }

    /// Scalar divide operator (2-by-3)
    CUTLASS_HOST_DEVICE
    Matrix& operator/=(Element const& s) {
        data[0] /= s;
        data[1] /= s;
        data[2] /= s;

        data[3] /= s;
        data[4] /= s;
        data[5] /= s;

        return *this;
    }

    /// Elementwise divide operator (2-by-3)
    CUTLASS_HOST_DEVICE
    Matrix operator/(Matrix const& rhs) const { return divide(rhs); }

    /// Elementwise divide operator (2-by-3)
    CUTLASS_HOST_DEVICE
    Matrix& operator/=(Matrix const& rhs) {
        data[0] /= rhs.data[0];
        data[1] /= rhs.data[1];
        data[2] /= rhs.data[2];

        data[3] /= rhs.data[3];
        data[4] /= rhs.data[4];
        data[5] /= rhs.data[5];

        return *this;
    }

    /// Negates each element of the matrix
    CUTLASS_HOST_DEVICE
    Matrix operator-() const {
        Matrix m;

        m.data[0] = -m.data[0];
        m.data[1] = -m.data[1];
        m.data[2] = -m.data[2];
        m.data[3] = -m.data[3];
        m.data[4] = -m.data[4];
        m.data[5] = -m.data[5];

        return m;
    }

    /// Matrix product of size 2-by-1-by-3
    CUTLASS_HOST_DEVICE
    Matrix<Element, 2, 1> product(
            Matrix<Element, 3, 1> const& rhs,
            Matrix<Element, 2, 1> accum = Matrix<Element, 2, 1>()) const {
        // k=0
        accum.data[0] += data[0] * rhs.data[0];
        accum.data[1] += data[3] * rhs.data[0];

        // k=1
        accum.data[0] += data[1] * rhs.data[1];
        accum.data[1] += data[4] * rhs.data[1];

        // k=2
        accum.data[0] += data[2] * rhs.data[2];
        accum.data[1] += data[5] * rhs.data[2];

        return accum;
    }

    /// Matrix product of size 2-by-1-by-3
    CUTLASS_HOST_DEVICE
    Matrix<Element, 2, 1> operator*(Matrix<Element, 3, 1> const& rhs) const {
        return product(rhs);
    }

    /// Matrix product of size 2-by-2-by-3
    CUTLASS_HOST_DEVICE
    Matrix<Element, 2, 2> product(
            Matrix<Element, 3, 2> const& rhs,
            Matrix<Element, 2, 2> accum = Matrix<Element, 2, 2>()) const {
        // k=0
        accum.data[0] += data[0] * rhs.data[0];
        accum.data[1] += data[0] * rhs.data[1];
        accum.data[2] += data[3] * rhs.data[0];
        accum.data[3] += data[3] * rhs.data[1];

        // k=1
        accum.data[0] += data[1] * rhs.data[2];
        accum.data[1] += data[1] * rhs.data[3];
        accum.data[2] += data[4] * rhs.data[2];
        accum.data[3] += data[4] * rhs.data[3];

        // k=2
        accum.data[0] += data[2] * rhs.data[4];
        accum.data[1] += data[2] * rhs.data[5];
        accum.data[2] += data[5] * rhs.data[4];
        accum.data[3] += data[5] * rhs.data[5];

        return accum;
    }

    /// Matrix product of size 2-by-2-by-3
    CUTLASS_HOST_DEVICE
    Matrix<Element, 2, 2> operator*(Matrix<Element, 3, 2> const& rhs) const {
        return product(rhs);
    }

    /// Matrix product of size 2-by-3-by-3
    CUTLASS_HOST_DEVICE
    Matrix<Element, 2, 3> product(
            Matrix<Element, 3, 3> const& rhs,
            Matrix<Element, 2, 3> accum = Matrix<Element, 2, 3>()) const {
        // k=0
        accum.data[0] += data[0] * rhs.data[0];
        accum.data[1] += data[0] * rhs.data[1];
        accum.data[2] += data[0] * rhs.data[2];
        accum.data[3] += data[3] * rhs.data[0];
        accum.data[4] += data[3] * rhs.data[1];
        accum.data[5] += data[3] * rhs.data[2];

        // k=1
        accum.data[0] += data[1] * rhs.data[3];
        accum.data[1] += data[1] * rhs.data[4];
        accum.data[2] += data[1] * rhs.data[5];
        accum.data[3] += data[4] * rhs.data[3];
        accum.data[4] += data[4] * rhs.data[4];
        accum.data[5] += data[4] * rhs.data[5];

        // k=2
        accum.data[0] += data[2] * rhs.data[6];
        accum.data[1] += data[2] * rhs.data[7];
        accum.data[2] += data[2] * rhs.data[8];
        accum.data[3] += data[5] * rhs.data[6];
        accum.data[4] += data[5] * rhs.data[7];
        accum.data[5] += data[5] * rhs.data[8];

        return accum;
    }

    /// Matrix product of size 2-by-3-by-3
    CUTLASS_HOST_DEVICE
    Matrix<Element, 2, 3> operator*(Matrix<Element, 3, 3> const& rhs) const {
        return product(rhs);
    }

    /// Matrix product of size 2-by-3-by-3
    CUTLASS_HOST_DEVICE
    Matrix& operator*=(Matrix<Element, 3, 3> const& rhs) {
        *this = product(rhs);
        return *this;
    }

    /// Matrix product of size 2-by-4-by-3
    CUTLASS_HOST_DEVICE
    Matrix<Element, 2, 4> product(
            Matrix<Element, 3, 4> const& rhs,
            Matrix<Element, 2, 4> accum = Matrix<Element, 2, 4>()) const {
        // k=0
        accum.data[0] += data[0] * rhs.data[0];
        accum.data[1] += data[0] * rhs.data[1];
        accum.data[2] += data[0] * rhs.data[2];
        accum.data[3] += data[0] * rhs.data[3];
        accum.data[4] += data[3] * rhs.data[0];
        accum.data[5] += data[3] * rhs.data[1];
        accum.data[6] += data[3] * rhs.data[2];
        accum.data[7] += data[3] * rhs.data[3];

        // k=1
        accum.data[0] += data[1] * rhs.data[4];
        accum.data[1] += data[1] * rhs.data[5];
        accum.data[2] += data[1] * rhs.data[6];
        accum.data[3] += data[1] * rhs.data[7];
        accum.data[4] += data[4] * rhs.data[4];
        accum.data[5] += data[4] * rhs.data[5];
        accum.data[6] += data[4] * rhs.data[6];
        accum.data[7] += data[4] * rhs.data[7];

        // k=2
        accum.data[0] += data[2] * rhs.data[8];
        accum.data[1] += data[2] * rhs.data[9];
        accum.data[2] += data[2] * rhs.data[10];
        accum.data[3] += data[2] * rhs.data[11];
        accum.data[4] += data[5] * rhs.data[8];
        accum.data[5] += data[5] * rhs.data[9];
        accum.data[6] += data[5] * rhs.data[10];
        accum.data[7] += data[5] * rhs.data[11];

        return accum;
    }

    /// Matrix product of size 2-by-4-by-3
    CUTLASS_HOST_DEVICE
    Matrix<Element, 2, 4> operator*(Matrix<Element, 3, 4> const& rhs) const {
        return product(rhs);
    }

    /// Returns the sum of elements
    CUTLASS_HOST_DEVICE
    Element sum(Element accum = Element()) const {
        accum += data[0];
        accum += data[1];
        accum += data[2];
        accum += data[3];
        accum += data[4];
        accum += data[5];

        return accum;
    }

    /// Returns the sum of squared elements
    CUTLASS_HOST_DEVICE
    Element norm(Element accum = Element()) const {
        accum += data[0] * data[0];
        accum += data[1] * data[1];
        accum += data[2] * data[2];
        accum += data[3] * data[3];
        accum += data[4] * data[4];
        accum += data[5] * data[5];

        return accum;
    }

    /// Returns square root of the norm
    CUTLASS_HOST_DEVICE
    Element magnitude() const { return fast_sqrt(norm()); }

    /// Returns the sum of diagonal elements
    CUTLASS_HOST_DEVICE
    Element trace(Element accum = Element()) const {
        accum += data[0];
        accum += data[4];

        return accum;
    }
};

/// Template alias for 2-by-3 matrix
template <typename Element>
using Matrix2x3 = Matrix<Element, 2, 3>;

/// Free funciton to infer element type from template arguments
template <typename Element>
CUTLASS_HOST_DEVICE Matrix2x3<Element> make_Matrix2x3(
        Element _0_0, Element _0_1, Element _0_2, Element _1_0, Element _1_1,
        Element _1_2) {
    return Matrix2x3<Element>(_0_0, _0_1, _0_2, _1_0, _1_1, _1_2);
}

/////////////////////////////////////////////////////////////////////////////////////////////////

/// 2-by-4 matrix template class definition
template <typename Element_>
struct Matrix<Element_, 2, 4> {
    //
    // Type definitions
    //

    /// Element data type
    using Element = Element_;

    /// Number of rows in matrix
    static int const kRows = 2;

    /// Number of columns in matrix
    static int const kColumns = 4;

    /// Layout of matrix in underlying array
    using Layout = layout::RowMajor;

    /// Number of elements in matrix
    static int const kCount = 8;

    //
    // Data members
    //

    /// Elements of the matrix in row-major layout
    Array<Element, kCount> data;

    //
    // Methods
    //

    /// Constructs a zero matrix
    CUTLASS_HOST_DEVICE
    Matrix() { data.clear(); }

    /// Copy constructor for a 2-by-4 matrix
    CUTLASS_HOST_DEVICE
    Matrix(Matrix const& rhs) { data = rhs.data; }

    /// Constucts a 2-by-4 matrix from scalar elements
    CUTLASS_HOST_DEVICE
    Matrix(Element _0_0, Element _0_1, Element _0_2, Element _0_3, Element _1_0,
           Element _1_1, Element _1_2, Element _1_3) {
        data[0] = _0_0;
        data[1] = _0_1;
        data[2] = _0_2;
        data[3] = _0_3;
        data[4] = _1_0;
        data[5] = _1_1;
        data[6] = _1_2;
        data[7] = _1_3;
    }

    /// Constucts a 2-by-4 matrix from row vectors
    CUTLASS_HOST_DEVICE
    Matrix(Matrix<Element, 1, 4> const& row_0,
           Matrix<Element, 1, 4> const& row_1) {
        data[0] = row_0.data[0];
        data[1] = row_0.data[1];
        data[2] = row_0.data[2];
        data[3] = row_0.data[3];
        data[4] = row_1.data[0];
        data[5] = row_1.data[1];
        data[6] = row_1.data[2];
        data[7] = row_1.data[3];
    }

    /// Static method to construct a 2-by-4 matrix from column vectors
    CUTLASS_HOST_DEVICE
    static Matrix from_columns(Matrix<Element, 4, 1> const& column_0,
                               Matrix<Element, 4, 1> const& column_1,
                               Matrix<Element, 4, 1> const& column_2,
                               Matrix<Element, 4, 1> const& column_3) {
        Matrix result;

        result.data[0] = column_0.data[0];
        result.data[1] = column_1.data[0];
        result.data[2] = column_2.data[0];
        result.data[3] = column_3.data[0];
        result.data[4] = column_0.data[1];
        result.data[5] = column_1.data[1];
        result.data[6] = column_2.data[1];
        result.data[7] = column_3.data[1];
        return result;
    }

    /// Constructs a matrix from a uniform element
    CUTLASS_HOST_DEVICE
    static Matrix uniform(Element s) {
        Matrix m;

        m.data[0] = s;
        m.data[1] = s;
        m.data[2] = s;
        m.data[3] = s;
        m.data[4] = s;
        m.data[5] = s;
        m.data[6] = s;
        m.data[7] = s;

        return m;
    }

    /// Constructs a matrix from a uniform element 1
    CUTLASS_HOST_DEVICE
    static Matrix ones() { return uniform(Element(1)); }

    /// Constructs a matrix from a uniform element 0
    CUTLASS_HOST_DEVICE
    static Matrix zero() { return Matrix(); }

    /// Constructs a matrix from elements along its diagonal
    CUTLASS_HOST_DEVICE
    static Matrix from_diagonal(Matrix<Element, 2, 1> const& diag) {
        Matrix m;

        m.data[0] = diag.data[0];
        m.data[3] = diag.data[1];

        return m;
    }

    /// Constructs a matrix from elements along its diagonal
    CUTLASS_HOST_DEVICE
    static Matrix from_diagonal(Matrix<Element, 1, 2> const& diag) {
        Matrix m;

        m.data[0] = diag.data[0];
        m.data[3] = diag.data[1];

        return m;
    }

    /// Gets an array of diagonal elements
    CUTLASS_HOST_DEVICE
    Matrix<Element, 2, 1> diagonal() const {
        Matrix<Element, 2, 1> diag;

        diag.data[0] = data[0];
        diag.data[1] = data[3];

        return diag;
    }

    /// Returns a transposed matrix
    CUTLASS_HOST_DEVICE
    Matrix<Element, 4, 2> transpose() const {
        Matrix<Element, 4, 2> mt;

        mt.data[0] = data[0];
        mt.data[2] = data[1];
        mt.data[4] = data[2];
        mt.data[6] = data[3];
        mt.data[1] = data[4];
        mt.data[3] = data[5];
        mt.data[5] = data[6];
        mt.data[7] = data[7];

        return mt;
    }

    /// Accesses an element by coordinate
    CUTLASS_HOST_DEVICE
    Element at(int i, int j) const { return data[i * 2 + j]; }

    /// Accesses an element by coordinate
    CUTLASS_HOST_DEVICE
    Element& at(int i, int j) { return data[i * 2 + j]; }

    /// Accesses an element by coordinate
    CUTLASS_HOST_DEVICE
    Element at(Coord<2> const& coord) const { return at(coord[0], coord[1]); }

    /// Accesses an element by coordinate
    CUTLASS_HOST_DEVICE
    Element& at(Coord<2> const& coord) { return at(coord[0], coord[1]); }

    /// Accesses an element by offset
    CUTLASS_HOST_DEVICE
    Element& at(int offset) { return data[offset]; }

    /// Accesses an element by offset
    CUTLASS_HOST_DEVICE
    Element at(int offset) const { return data[offset]; }

    /// Accesses an element by coordinate
    CUTLASS_HOST_DEVICE
    Element operator[](Coord<2> const& coord) const {
        return at(coord[0], coord[1]);
    }

    /// Accesses an element by coordinate
    CUTLASS_HOST_DEVICE
    Element& operator[](Coord<2> const& coord) {
        return at(coord[0], coord[1]);
    }

    /// Accesses an element by offset
    CUTLASS_HOST_DEVICE
    Element& operator[](int offset) { return data[offset]; }

    /// Accesses an element by offset
    CUTLASS_HOST_DEVICE
    Element operator[](int offset) const { return data[offset]; }

    /// Gets a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix<Element, 1, 2> slice_1x2(int i = 0, int j = 0) const {
        Matrix<Element, 1, 2> m;

        m.data[0] = data[i * 4 + j + 0];
        m.data[1] = data[i * 4 + j + 1];

        return m;
    }

    /// Overwrites a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix& set_slice_1x2(Matrix<Element, 1, 2> const& m, int i = 0,
                          int j = 0) {
        data[i * 4 + j + 0] = m.data[0];
        data[i * 4 + j + 1] = m.data[1];

        return *this;
    }

    /// Gets a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix<Element, 1, 3> slice_1x3(int i = 0, int j = 0) const {
        Matrix<Element, 1, 3> m;

        m.data[0] = data[i * 4 + j + 0];
        m.data[1] = data[i * 4 + j + 1];
        m.data[2] = data[i * 4 + j + 2];

        return m;
    }

    /// Overwrites a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix& set_slice_1x3(Matrix<Element, 1, 3> const& m, int i = 0,
                          int j = 0) {
        data[i * 4 + j + 0] = m.data[0];
        data[i * 4 + j + 1] = m.data[1];
        data[i * 4 + j + 2] = m.data[2];

        return *this;
    }

    /// Gets a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix<Element, 1, 4> slice_1x4(int i = 0, int j = 0) const {
        Matrix<Element, 1, 4> m;

        m.data[0] = data[i * 4 + j + 0];
        m.data[1] = data[i * 4 + j + 1];
        m.data[2] = data[i * 4 + j + 2];
        m.data[3] = data[i * 4 + j + 3];

        return m;
    }

    /// Overwrites a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix& set_slice_1x4(Matrix<Element, 1, 4> const& m, int i = 0,
                          int j = 0) {
        data[i * 4 + j + 0] = m.data[0];
        data[i * 4 + j + 1] = m.data[1];
        data[i * 4 + j + 2] = m.data[2];
        data[i * 4 + j + 3] = m.data[3];

        return *this;
    }

    CUTLASS_HOST_DEVICE
    Matrix<Element, 1, 4> row(int i) const { return slice_1x4(i, 0); }

    Matrix& set_row(Matrix<Element, 1, 4> const& v, int i = 0) {
        return set_slice_1x4(v, i, 0);
    }

    /// Gets a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix<Element, 2, 1> slice_2x1(int i = 0, int j = 0) const {
        Matrix<Element, 2, 1> m;

        m.data[0] = data[i * 4 + j + 0];
        m.data[1] = data[i * 4 + j + 4];

        return m;
    }

    /// Overwrites a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix& set_slice_2x1(Matrix<Element, 2, 1> const& m, int i = 0,
                          int j = 0) {
        data[i * 4 + j + 0] = m.data[0];
        data[i * 4 + j + 4] = m.data[1];

        return *this;
    }

    CUTLASS_HOST_DEVICE
    Matrix<Element, 2, 1> column(int j) const { return slice_2x1(0, j); }

    Matrix& set_column(Matrix<Element, 2, 1> const& v, int j = 0) {
        return set_slice_2x1(v, 0, j);
    }

    /// Gets a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix<Element, 2, 2> slice_2x2(int i = 0, int j = 0) const {
        Matrix<Element, 2, 2> m;

        m.data[0] = data[i * 4 + j + 0];
        m.data[1] = data[i * 4 + j + 1];
        m.data[2] = data[i * 4 + j + 4];
        m.data[3] = data[i * 4 + j + 5];

        return m;
    }

    /// Overwrites a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix& set_slice_2x2(Matrix<Element, 2, 2> const& m, int i = 0,
                          int j = 0) {
        data[i * 4 + j + 0] = m.data[0];
        data[i * 4 + j + 1] = m.data[1];
        data[i * 4 + j + 4] = m.data[2];
        data[i * 4 + j + 5] = m.data[3];

        return *this;
    }

    /// Gets a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix<Element, 2, 3> slice_2x3(int i = 0, int j = 0) const {
        Matrix<Element, 2, 3> m;

        m.data[0] = data[i * 4 + j + 0];
        m.data[1] = data[i * 4 + j + 1];
        m.data[2] = data[i * 4 + j + 2];
        m.data[3] = data[i * 4 + j + 4];
        m.data[4] = data[i * 4 + j + 5];
        m.data[5] = data[i * 4 + j + 6];

        return m;
    }

    /// Overwrites a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix& set_slice_2x3(Matrix<Element, 2, 3> const& m, int i = 0,
                          int j = 0) {
        data[i * 4 + j + 0] = m.data[0];
        data[i * 4 + j + 1] = m.data[1];
        data[i * 4 + j + 2] = m.data[2];
        data[i * 4 + j + 4] = m.data[3];
        data[i * 4 + j + 5] = m.data[4];
        data[i * 4 + j + 6] = m.data[5];

        return *this;
    }

    /// Gets a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix<Element, 2, 4> slice_2x4(int i = 0, int j = 0) const {
        Matrix<Element, 2, 4> m;

        m.data[0] = data[i * 4 + j + 0];
        m.data[1] = data[i * 4 + j + 1];
        m.data[2] = data[i * 4 + j + 2];
        m.data[3] = data[i * 4 + j + 3];
        m.data[4] = data[i * 4 + j + 4];
        m.data[5] = data[i * 4 + j + 5];
        m.data[6] = data[i * 4 + j + 6];
        m.data[7] = data[i * 4 + j + 7];

        return m;
    }

    /// Overwrites a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix& set_slice_2x4(Matrix<Element, 2, 4> const& m, int i = 0,
                          int j = 0) {
        data[i * 4 + j + 0] = m.data[0];
        data[i * 4 + j + 1] = m.data[1];
        data[i * 4 + j + 2] = m.data[2];
        data[i * 4 + j + 3] = m.data[3];
        data[i * 4 + j + 4] = m.data[4];
        data[i * 4 + j + 5] = m.data[5];
        data[i * 4 + j + 6] = m.data[6];
        data[i * 4 + j + 7] = m.data[7];

        return *this;
    }

    /// Forms a 2-by-4 matrix by horizontally concatenating a 2-by-1 matrix with
    /// a 2-by-3 matrix
    CUTLASS_HOST_DEVICE
    static Matrix hcat(Matrix<Element, 2, 1> const& lhs,
                       Matrix<Element, 2, 3> const& rhs) {
        return Matrix(lhs.at(0, 0), rhs.at(0, 0), rhs.at(0, 1), rhs.at(0, 2),
                      lhs.at(1, 0), rhs.at(1, 0), rhs.at(1, 1), rhs.at(1, 2));
    }

    /// Forms a 2-by-4 matrix by horizontally concatenating a 2-by-2 matrix with
    /// a 2-by-2 matrix
    CUTLASS_HOST_DEVICE
    static Matrix hcat(Matrix<Element, 2, 2> const& lhs,
                       Matrix<Element, 2, 2> const& rhs) {
        return Matrix(lhs.at(0, 0), lhs.at(0, 1), rhs.at(0, 0), rhs.at(0, 1),
                      lhs.at(1, 0), lhs.at(1, 1), rhs.at(1, 0), rhs.at(1, 1));
    }

    /// Forms a 2-by-4 matrix by horizontally concatenating a 2-by-3 matrix with
    /// a 2-by-1 matrix
    CUTLASS_HOST_DEVICE
    static Matrix hcat(Matrix<Element, 2, 3> const& lhs,
                       Matrix<Element, 2, 1> const& rhs) {
        return Matrix(lhs.at(0, 0), lhs.at(0, 1), lhs.at(0, 2), rhs.at(0, 0),
                      lhs.at(1, 0), lhs.at(1, 1), lhs.at(1, 2), rhs.at(1, 0));
    }

    /// Forms a 2-by-4 matrix by vertically concatenating a 1-by-4 matrix with a
    /// 1-by-4 matrix
    CUTLASS_HOST_DEVICE
    static Matrix vcat(Matrix<Element, 1, 4> const& upper,
                       Matrix<Element, 1, 4> const& lower) {
        return Matrix(upper.at(0, 0), upper.at(0, 1), upper.at(0, 2),
                      upper.at(0, 3), lower.at(0, 0), lower.at(0, 1),
                      lower.at(0, 2), lower.at(0, 3));
    }

    /// Concatenates this matrix with a a 1-by-4 matrix to form a 3-by-4 matrix
    CUTLASS_HOST_DEVICE
    Matrix<Element, 3, 4> vcat(Matrix<Element, 1, 4> const& rhs) const {
        return Matrix<Element, 3, 4>::vcat(*this, rhs);
    }

    /// Concatenates this matrix with a a 2-by-4 matrix to form a 4-by-4 matrix
    CUTLASS_HOST_DEVICE
    Matrix<Element, 4, 4> vcat(Matrix<Element, 2, 4> const& rhs) const {
        return Matrix<Element, 4, 4>::vcat(*this, rhs);
    }

    /// Forms a 2-by-4 matrix by concatenating four components
    CUTLASS_HOST_DEVICE
    static Matrix block(Element A, Matrix<Element, 1, 3> const& B, Element C,
                        Matrix<Element, 1, 3> const& D) {
        return Matrix(A, B.at(0, 0), B.at(0, 1), B.at(0, 2), C, D.at(0, 0),
                      D.at(0, 1), D.at(0, 2));
    }

    /// Forms a 2-by-4 matrix by concatenating four components
    CUTLASS_HOST_DEVICE
    static Matrix block(Matrix<Element, 1, 2> const& A,
                        Matrix<Element, 1, 2> const& B,
                        Matrix<Element, 1, 2> const& C,
                        Matrix<Element, 1, 2> const& D) {
        return Matrix(A.at(0, 0), A.at(0, 1), B.at(0, 0), B.at(0, 1),
                      C.at(0, 0), C.at(0, 1), D.at(0, 0), D.at(0, 1));
    }

    /// Forms a 2-by-4 matrix by concatenating four components
    CUTLASS_HOST_DEVICE
    static Matrix block(Matrix<Element, 1, 3> const& A, Element B,
                        Matrix<Element, 1, 3> const& C, Element D) {
        return Matrix(A.at(0, 0), A.at(0, 1), A.at(0, 2), B, C.at(0, 0),
                      C.at(0, 1), C.at(0, 2), D);
    }

    /// Elementwise add operator (2-by-4)
    CUTLASS_HOST_DEVICE
    Matrix add(Matrix const& rhs) const {
        Matrix result;

        result.data[0] = data[0] + rhs.data[0];
        result.data[1] = data[1] + rhs.data[1];
        result.data[2] = data[2] + rhs.data[2];
        result.data[3] = data[3] + rhs.data[3];

        result.data[4] = data[4] + rhs.data[4];
        result.data[5] = data[5] + rhs.data[5];
        result.data[6] = data[6] + rhs.data[6];
        result.data[7] = data[7] + rhs.data[7];

        return result;
    }

    /// Elementwise add operator (2-by-4)
    CUTLASS_HOST_DEVICE
    Matrix operator+(Matrix const& rhs) const { return add(rhs); }

    /// Elementwise add operator (2-by-4)
    CUTLASS_HOST_DEVICE
    Matrix& operator+=(Matrix const& rhs) {
        data[0] += rhs.data[0];
        data[1] += rhs.data[1];
        data[2] += rhs.data[2];
        data[3] += rhs.data[3];

        data[4] += rhs.data[4];
        data[5] += rhs.data[5];
        data[6] += rhs.data[6];
        data[7] += rhs.data[7];

        return *this;
    }

    /// Elementwise subtract operator (2-by-4)
    CUTLASS_HOST_DEVICE
    Matrix subtract(Matrix const& rhs) const {
        Matrix result;

        result.data[0] = data[0] - rhs.data[0];
        result.data[1] = data[1] - rhs.data[1];
        result.data[2] = data[2] - rhs.data[2];
        result.data[3] = data[3] - rhs.data[3];

        result.data[4] = data[4] - rhs.data[4];
        result.data[5] = data[5] - rhs.data[5];
        result.data[6] = data[6] - rhs.data[6];
        result.data[7] = data[7] - rhs.data[7];

        return result;
    }

    /// Elementwise subtract operator (2-by-4)
    CUTLASS_HOST_DEVICE
    Matrix operator-(Matrix const& rhs) const { return subtract(rhs); }

    /// Elementwise subtract operator (2-by-4)
    CUTLASS_HOST_DEVICE
    Matrix& operator-=(Matrix const& rhs) {
        data[0] -= rhs.data[0];
        data[1] -= rhs.data[1];
        data[2] -= rhs.data[2];
        data[3] -= rhs.data[3];

        data[4] -= rhs.data[4];
        data[5] -= rhs.data[5];
        data[6] -= rhs.data[6];
        data[7] -= rhs.data[7];

        return *this;
    }

    /// Elementwise multiply operator (2-by-4)
    CUTLASS_HOST_DEVICE
    Matrix multiply(Matrix const& rhs) const {
        Matrix result;

        result.data[0] = data[0] * rhs.data[0];
        result.data[1] = data[1] * rhs.data[1];
        result.data[2] = data[2] * rhs.data[2];
        result.data[3] = data[3] * rhs.data[3];

        result.data[4] = data[4] * rhs.data[4];
        result.data[5] = data[5] * rhs.data[5];
        result.data[6] = data[6] * rhs.data[6];
        result.data[7] = data[7] * rhs.data[7];

        return result;
    }

    /// Scalar multiply operator (2-by-4)
    CUTLASS_HOST_DEVICE
    Matrix multiply(Element const& s) const {
        Matrix result;

        result.data[0] = data[0] * s;
        result.data[1] = data[1] * s;
        result.data[2] = data[2] * s;
        result.data[3] = data[3] * s;

        result.data[4] = data[4] * s;
        result.data[5] = data[5] * s;
        result.data[6] = data[6] * s;
        result.data[7] = data[7] * s;

        return result;
    }

    /// Scalar multiply operator (2-by-4)
    CUTLASS_HOST_DEVICE
    Matrix operator*(Element const& s) const { return multiply(s); }

    /// Scalar multiply operator (2-by-4)
    CUTLASS_HOST_DEVICE
    Matrix& operator*=(Element const& s) {
        data[0] *= s;
        data[1] *= s;
        data[2] *= s;
        data[3] *= s;

        data[4] *= s;
        data[5] *= s;
        data[6] *= s;
        data[7] *= s;

        return *this;
    }

    /// Elementwise divide operator (2-by-4)
    CUTLASS_HOST_DEVICE
    Matrix divide(Matrix const& rhs) const {
        Matrix result;

        result.data[0] = data[0] / rhs.data[0];
        result.data[1] = data[1] / rhs.data[1];
        result.data[2] = data[2] / rhs.data[2];
        result.data[3] = data[3] / rhs.data[3];

        result.data[4] = data[4] / rhs.data[4];
        result.data[5] = data[5] / rhs.data[5];
        result.data[6] = data[6] / rhs.data[6];
        result.data[7] = data[7] / rhs.data[7];

        return result;
    }

    /// Scalar divide operator (2-by-4)
    CUTLASS_HOST_DEVICE
    Matrix divide(Element const& s) const {
        Matrix result;

        result.data[0] = data[0] / s;
        result.data[1] = data[1] / s;
        result.data[2] = data[2] / s;
        result.data[3] = data[3] / s;

        result.data[4] = data[4] / s;
        result.data[5] = data[5] / s;
        result.data[6] = data[6] / s;
        result.data[7] = data[7] / s;

        return result;
    }

    /// Scalar divide operator (2-by-4)
    CUTLASS_HOST_DEVICE
    Matrix operator/(Element const& s) const { return divide(s); }

    /// Scalar divide operator (2-by-4)
    CUTLASS_HOST_DEVICE
    Matrix& operator/=(Element const& s) {
        data[0] /= s;
        data[1] /= s;
        data[2] /= s;
        data[3] /= s;

        data[4] /= s;
        data[5] /= s;
        data[6] /= s;
        data[7] /= s;

        return *this;
    }

    /// Elementwise divide operator (2-by-4)
    CUTLASS_HOST_DEVICE
    Matrix operator/(Matrix const& rhs) const { return divide(rhs); }

    /// Elementwise divide operator (2-by-4)
    CUTLASS_HOST_DEVICE
    Matrix& operator/=(Matrix const& rhs) {
        data[0] /= rhs.data[0];
        data[1] /= rhs.data[1];
        data[2] /= rhs.data[2];
        data[3] /= rhs.data[3];

        data[4] /= rhs.data[4];
        data[5] /= rhs.data[5];
        data[6] /= rhs.data[6];
        data[7] /= rhs.data[7];

        return *this;
    }

    /// Negates each element of the matrix
    CUTLASS_HOST_DEVICE
    Matrix operator-() const {
        Matrix m;

        m.data[0] = -m.data[0];
        m.data[1] = -m.data[1];
        m.data[2] = -m.data[2];
        m.data[3] = -m.data[3];
        m.data[4] = -m.data[4];
        m.data[5] = -m.data[5];
        m.data[6] = -m.data[6];
        m.data[7] = -m.data[7];

        return m;
    }

    /// Matrix product of size 2-by-1-by-4
    CUTLASS_HOST_DEVICE
    Matrix<Element, 2, 1> product(
            Matrix<Element, 4, 1> const& rhs,
            Matrix<Element, 2, 1> accum = Matrix<Element, 2, 1>()) const {
        // k=0
        accum.data[0] += data[0] * rhs.data[0];
        accum.data[1] += data[4] * rhs.data[0];

        // k=1
        accum.data[0] += data[1] * rhs.data[1];
        accum.data[1] += data[5] * rhs.data[1];

        // k=2
        accum.data[0] += data[2] * rhs.data[2];
        accum.data[1] += data[6] * rhs.data[2];

        // k=3
        accum.data[0] += data[3] * rhs.data[3];
        accum.data[1] += data[7] * rhs.data[3];

        return accum;
    }

    /// Matrix product of size 2-by-1-by-4
    CUTLASS_HOST_DEVICE
    Matrix<Element, 2, 1> operator*(Matrix<Element, 4, 1> const& rhs) const {
        return product(rhs);
    }

    /// Matrix product of size 2-by-2-by-4
    CUTLASS_HOST_DEVICE
    Matrix<Element, 2, 2> product(
            Matrix<Element, 4, 2> const& rhs,
            Matrix<Element, 2, 2> accum = Matrix<Element, 2, 2>()) const {
        // k=0
        accum.data[0] += data[0] * rhs.data[0];
        accum.data[1] += data[0] * rhs.data[1];
        accum.data[2] += data[4] * rhs.data[0];
        accum.data[3] += data[4] * rhs.data[1];

        // k=1
        accum.data[0] += data[1] * rhs.data[2];
        accum.data[1] += data[1] * rhs.data[3];
        accum.data[2] += data[5] * rhs.data[2];
        accum.data[3] += data[5] * rhs.data[3];

        // k=2
        accum.data[0] += data[2] * rhs.data[4];
        accum.data[1] += data[2] * rhs.data[5];
        accum.data[2] += data[6] * rhs.data[4];
        accum.data[3] += data[6] * rhs.data[5];

        // k=3
        accum.data[0] += data[3] * rhs.data[6];
        accum.data[1] += data[3] * rhs.data[7];
        accum.data[2] += data[7] * rhs.data[6];
        accum.data[3] += data[7] * rhs.data[7];

        return accum;
    }

    /// Matrix product of size 2-by-2-by-4
    CUTLASS_HOST_DEVICE
    Matrix<Element, 2, 2> operator*(Matrix<Element, 4, 2> const& rhs) const {
        return product(rhs);
    }

    /// Matrix product of size 2-by-3-by-4
    CUTLASS_HOST_DEVICE
    Matrix<Element, 2, 3> product(
            Matrix<Element, 4, 3> const& rhs,
            Matrix<Element, 2, 3> accum = Matrix<Element, 2, 3>()) const {
        // k=0
        accum.data[0] += data[0] * rhs.data[0];
        accum.data[1] += data[0] * rhs.data[1];
        accum.data[2] += data[0] * rhs.data[2];
        accum.data[3] += data[4] * rhs.data[0];
        accum.data[4] += data[4] * rhs.data[1];
        accum.data[5] += data[4] * rhs.data[2];

        // k=1
        accum.data[0] += data[1] * rhs.data[3];
        accum.data[1] += data[1] * rhs.data[4];
        accum.data[2] += data[1] * rhs.data[5];
        accum.data[3] += data[5] * rhs.data[3];
        accum.data[4] += data[5] * rhs.data[4];
        accum.data[5] += data[5] * rhs.data[5];

        // k=2
        accum.data[0] += data[2] * rhs.data[6];
        accum.data[1] += data[2] * rhs.data[7];
        accum.data[2] += data[2] * rhs.data[8];
        accum.data[3] += data[6] * rhs.data[6];
        accum.data[4] += data[6] * rhs.data[7];
        accum.data[5] += data[6] * rhs.data[8];

        // k=3
        accum.data[0] += data[3] * rhs.data[9];
        accum.data[1] += data[3] * rhs.data[10];
        accum.data[2] += data[3] * rhs.data[11];
        accum.data[3] += data[7] * rhs.data[9];
        accum.data[4] += data[7] * rhs.data[10];
        accum.data[5] += data[7] * rhs.data[11];

        return accum;
    }

    /// Matrix product of size 2-by-3-by-4
    CUTLASS_HOST_DEVICE
    Matrix<Element, 2, 3> operator*(Matrix<Element, 4, 3> const& rhs) const {
        return product(rhs);
    }

    /// Matrix product of size 2-by-4-by-4
    CUTLASS_HOST_DEVICE
    Matrix<Element, 2, 4> product(
            Matrix<Element, 4, 4> const& rhs,
            Matrix<Element, 2, 4> accum = Matrix<Element, 2, 4>()) const {
        // k=0
        accum.data[0] += data[0] * rhs.data[0];
        accum.data[1] += data[0] * rhs.data[1];
        accum.data[2] += data[0] * rhs.data[2];
        accum.data[3] += data[0] * rhs.data[3];
        accum.data[4] += data[4] * rhs.data[0];
        accum.data[5] += data[4] * rhs.data[1];
        accum.data[6] += data[4] * rhs.data[2];
        accum.data[7] += data[4] * rhs.data[3];

        // k=1
        accum.data[0] += data[1] * rhs.data[4];
        accum.data[1] += data[1] * rhs.data[5];
        accum.data[2] += data[1] * rhs.data[6];
        accum.data[3] += data[1] * rhs.data[7];
        accum.data[4] += data[5] * rhs.data[4];
        accum.data[5] += data[5] * rhs.data[5];
        accum.data[6] += data[5] * rhs.data[6];
        accum.data[7] += data[5] * rhs.data[7];

        // k=2
        accum.data[0] += data[2] * rhs.data[8];
        accum.data[1] += data[2] * rhs.data[9];
        accum.data[2] += data[2] * rhs.data[10];
        accum.data[3] += data[2] * rhs.data[11];
        accum.data[4] += data[6] * rhs.data[8];
        accum.data[5] += data[6] * rhs.data[9];
        accum.data[6] += data[6] * rhs.data[10];
        accum.data[7] += data[6] * rhs.data[11];

        // k=3
        accum.data[0] += data[3] * rhs.data[12];
        accum.data[1] += data[3] * rhs.data[13];
        accum.data[2] += data[3] * rhs.data[14];
        accum.data[3] += data[3] * rhs.data[15];
        accum.data[4] += data[7] * rhs.data[12];
        accum.data[5] += data[7] * rhs.data[13];
        accum.data[6] += data[7] * rhs.data[14];
        accum.data[7] += data[7] * rhs.data[15];

        return accum;
    }

    /// Matrix product of size 2-by-4-by-4
    CUTLASS_HOST_DEVICE
    Matrix<Element, 2, 4> operator*(Matrix<Element, 4, 4> const& rhs) const {
        return product(rhs);
    }

    /// Matrix product of size 2-by-4-by-4
    CUTLASS_HOST_DEVICE
    Matrix& operator*=(Matrix<Element, 4, 4> const& rhs) {
        *this = product(rhs);
        return *this;
    }

    /// Returns the sum of elements
    CUTLASS_HOST_DEVICE
    Element sum(Element accum = Element()) const {
        accum += data[0];
        accum += data[1];
        accum += data[2];
        accum += data[3];
        accum += data[4];
        accum += data[5];
        accum += data[6];
        accum += data[7];

        return accum;
    }

    /// Returns the sum of squared elements
    CUTLASS_HOST_DEVICE
    Element norm(Element accum = Element()) const {
        accum += data[0] * data[0];
        accum += data[1] * data[1];
        accum += data[2] * data[2];
        accum += data[3] * data[3];
        accum += data[4] * data[4];
        accum += data[5] * data[5];
        accum += data[6] * data[6];
        accum += data[7] * data[7];

        return accum;
    }

    /// Returns square root of the norm
    CUTLASS_HOST_DEVICE
    Element magnitude() const { return fast_sqrt(norm()); }

    /// Returns the sum of diagonal elements
    CUTLASS_HOST_DEVICE
    Element trace(Element accum = Element()) const {
        accum += data[0];
        accum += data[5];

        return accum;
    }
};

/// Template alias for 2-by-4 matrix
template <typename Element>
using Matrix2x4 = Matrix<Element, 2, 4>;

/// Free funciton to infer element type from template arguments
template <typename Element>
CUTLASS_HOST_DEVICE Matrix2x4<Element> make_Matrix2x4(
        Element _0_0, Element _0_1, Element _0_2, Element _0_3, Element _1_0,
        Element _1_1, Element _1_2, Element _1_3) {
    return Matrix2x4<Element>(_0_0, _0_1, _0_2, _0_3, _1_0, _1_1, _1_2, _1_3);
}

/////////////////////////////////////////////////////////////////////////////////////////////////

/// 3-by-1 matrix template class definition
template <typename Element_>
struct Matrix<Element_, 3, 1> {
    //
    // Type definitions
    //

    /// Element data type
    using Element = Element_;

    /// Number of rows in matrix
    static int const kRows = 3;

    /// Number of columns in matrix
    static int const kColumns = 1;

    /// Layout of matrix in underlying array
    using Layout = layout::RowMajor;

    /// Number of elements in matrix
    static int const kCount = 3;

    //
    // Data members
    //

    /// Elements of the matrix in row-major layout
    Array<Element, kCount> data;

    //
    // Methods
    //

    /// Constructs a zero matrix
    CUTLASS_HOST_DEVICE
    Matrix() { data.clear(); }

    /// Copy constructor for a 3-by-1 matrix
    CUTLASS_HOST_DEVICE
    Matrix(Matrix const& rhs) { data = rhs.data; }

    /// Constucts a 3-by-1 matrix from scalar elements
    CUTLASS_HOST_DEVICE
    Matrix(Element _0_0, Element _1_0, Element _2_0) {
        data[0] = _0_0;
        data[1] = _1_0;
        data[2] = _2_0;
    }

    /// Constructs a matrix from a uniform element
    CUTLASS_HOST_DEVICE
    static Matrix uniform(Element s) {
        Matrix m;

        m.data[0] = s;
        m.data[1] = s;
        m.data[2] = s;

        return m;
    }

    /// Constructs a matrix from a uniform element 1
    CUTLASS_HOST_DEVICE
    static Matrix ones() { return uniform(Element(1)); }

    /// Constructs a matrix from a uniform element 0
    CUTLASS_HOST_DEVICE
    static Matrix zero() { return Matrix(); }

    /// Returns a transposed matrix
    CUTLASS_HOST_DEVICE
    Matrix<Element, 1, 3> transpose() const {
        Matrix<Element, 1, 3> mt;

        mt.data[0] = data[0];
        mt.data[1] = data[1];
        mt.data[2] = data[2];

        return mt;
    }

    /// Accesses an element by coordinate
    CUTLASS_HOST_DEVICE
    Element at(int i, int j) const { return data[i * 3 + j]; }

    /// Accesses an element by coordinate
    CUTLASS_HOST_DEVICE
    Element& at(int i, int j) { return data[i * 3 + j]; }

    /// Accesses an element by coordinate
    CUTLASS_HOST_DEVICE
    Element at(Coord<2> const& coord) const { return at(coord[0], coord[1]); }

    /// Accesses an element by coordinate
    CUTLASS_HOST_DEVICE
    Element& at(Coord<2> const& coord) { return at(coord[0], coord[1]); }

    /// Accesses an element by offset
    CUTLASS_HOST_DEVICE
    Element& at(int offset) { return data[offset]; }

    /// Accesses an element by offset
    CUTLASS_HOST_DEVICE
    Element at(int offset) const { return data[offset]; }

    /// Accesses an element by coordinate
    CUTLASS_HOST_DEVICE
    Element operator[](Coord<2> const& coord) const {
        return at(coord[0], coord[1]);
    }

    /// Accesses an element by coordinate
    CUTLASS_HOST_DEVICE
    Element& operator[](Coord<2> const& coord) {
        return at(coord[0], coord[1]);
    }

    /// Accesses an element by offset
    CUTLASS_HOST_DEVICE
    Element& operator[](int offset) { return data[offset]; }

    /// Accesses an element by offset
    CUTLASS_HOST_DEVICE
    Element operator[](int offset) const { return data[offset]; }

    /// Gets a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix<Element, 2, 1> slice_2x1(int i = 0, int j = 0) const {
        Matrix<Element, 2, 1> m;

        m.data[0] = data[i * 1 + j + 0];
        m.data[1] = data[i * 1 + j + 1];

        return m;
    }

    /// Overwrites a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix& set_slice_2x1(Matrix<Element, 2, 1> const& m, int i = 0,
                          int j = 0) {
        data[i * 1 + j + 0] = m.data[0];
        data[i * 1 + j + 1] = m.data[1];

        return *this;
    }

    /// Gets a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix<Element, 3, 1> slice_3x1(int i = 0, int j = 0) const {
        Matrix<Element, 3, 1> m;

        m.data[0] = data[i * 1 + j + 0];
        m.data[1] = data[i * 1 + j + 1];
        m.data[2] = data[i * 1 + j + 2];

        return m;
    }

    /// Overwrites a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix& set_slice_3x1(Matrix<Element, 3, 1> const& m, int i = 0,
                          int j = 0) {
        data[i * 1 + j + 0] = m.data[0];
        data[i * 1 + j + 1] = m.data[1];
        data[i * 1 + j + 2] = m.data[2];

        return *this;
    }

    CUTLASS_HOST_DEVICE
    Matrix<Element, 3, 1> column(int j) const { return slice_3x1(0, j); }

    Matrix& set_column(Matrix<Element, 3, 1> const& v, int j = 0) {
        return set_slice_3x1(v, 0, j);
    }

    /// Concatenates this matrix with a a 3-by-1 matrix to form a 3-by-2 matrix
    CUTLASS_HOST_DEVICE
    Matrix<Element, 3, 2> hcat(Matrix<Element, 3, 1> const& rhs) const {
        return Matrix<Element, 3, 2>::hcat(*this, rhs);
    }

    /// Concatenates this matrix with a a 3-by-2 matrix to form a 3-by-3 matrix
    CUTLASS_HOST_DEVICE
    Matrix<Element, 3, 3> hcat(Matrix<Element, 3, 2> const& rhs) const {
        return Matrix<Element, 3, 3>::hcat(*this, rhs);
    }

    /// Concatenates this matrix with a a 3-by-3 matrix to form a 3-by-4 matrix
    CUTLASS_HOST_DEVICE
    Matrix<Element, 3, 4> hcat(Matrix<Element, 3, 3> const& rhs) const {
        return Matrix<Element, 3, 4>::hcat(*this, rhs);
    }

    /// Forms a 3-by-1 matrix by vertically concatenating an Element with a
    /// 2-by-1 matrix
    CUTLASS_HOST_DEVICE
    static Matrix vcat(Element upper, Matrix<Element, 2, 1> const& lower) {
        return Matrix(upper, lower.at(0, 0), lower.at(1, 0));
    }

    /// Forms a 3-by-1 matrix by vertically concatenating a 2-by-1 matrix with
    /// an Element
    CUTLASS_HOST_DEVICE
    static Matrix vcat(Matrix<Element, 2, 1> const& upper, Element lower) {
        return Matrix(upper.at(0, 0), upper.at(1, 0), lower);
    }

    /// Concatenates this matrix with a an Element to form a 4-by-1 matrix
    CUTLASS_HOST_DEVICE
    Matrix<Element, 4, 1> vcat(Element rhs) const {
        return Matrix<Element, 4, 1>::vcat(*this, rhs);
    }

    /// Elementwise add operator (3-by-1)
    CUTLASS_HOST_DEVICE
    Matrix add(Matrix const& rhs) const {
        Matrix result;

        result.data[0] = data[0] + rhs.data[0];

        result.data[1] = data[1] + rhs.data[1];

        result.data[2] = data[2] + rhs.data[2];

        return result;
    }

    /// Elementwise add operator (3-by-1)
    CUTLASS_HOST_DEVICE
    Matrix operator+(Matrix const& rhs) const { return add(rhs); }

    /// Elementwise add operator (3-by-1)
    CUTLASS_HOST_DEVICE
    Matrix& operator+=(Matrix const& rhs) {
        data[0] += rhs.data[0];

        data[1] += rhs.data[1];

        data[2] += rhs.data[2];

        return *this;
    }

    /// Elementwise subtract operator (3-by-1)
    CUTLASS_HOST_DEVICE
    Matrix subtract(Matrix const& rhs) const {
        Matrix result;

        result.data[0] = data[0] - rhs.data[0];

        result.data[1] = data[1] - rhs.data[1];

        result.data[2] = data[2] - rhs.data[2];

        return result;
    }

    /// Elementwise subtract operator (3-by-1)
    CUTLASS_HOST_DEVICE
    Matrix operator-(Matrix const& rhs) const { return subtract(rhs); }

    /// Elementwise subtract operator (3-by-1)
    CUTLASS_HOST_DEVICE
    Matrix& operator-=(Matrix const& rhs) {
        data[0] -= rhs.data[0];

        data[1] -= rhs.data[1];

        data[2] -= rhs.data[2];

        return *this;
    }

    /// Elementwise multiply operator (3-by-1)
    CUTLASS_HOST_DEVICE
    Matrix multiply(Matrix const& rhs) const {
        Matrix result;

        result.data[0] = data[0] * rhs.data[0];

        result.data[1] = data[1] * rhs.data[1];

        result.data[2] = data[2] * rhs.data[2];

        return result;
    }

    /// Scalar multiply operator (3-by-1)
    CUTLASS_HOST_DEVICE
    Matrix multiply(Element const& s) const {
        Matrix result;

        result.data[0] = data[0] * s;

        result.data[1] = data[1] * s;

        result.data[2] = data[2] * s;

        return result;
    }

    /// Scalar multiply operator (3-by-1)
    CUTLASS_HOST_DEVICE
    Matrix operator*(Element const& s) const { return multiply(s); }

    /// Scalar multiply operator (3-by-1)
    CUTLASS_HOST_DEVICE
    Matrix& operator*=(Element const& s) {
        data[0] *= s;

        data[1] *= s;

        data[2] *= s;

        return *this;
    }

    /// Elementwise divide operator (3-by-1)
    CUTLASS_HOST_DEVICE
    Matrix divide(Matrix const& rhs) const {
        Matrix result;

        result.data[0] = data[0] / rhs.data[0];

        result.data[1] = data[1] / rhs.data[1];

        result.data[2] = data[2] / rhs.data[2];

        return result;
    }

    /// Scalar divide operator (3-by-1)
    CUTLASS_HOST_DEVICE
    Matrix divide(Element const& s) const {
        Matrix result;

        result.data[0] = data[0] / s;

        result.data[1] = data[1] / s;

        result.data[2] = data[2] / s;

        return result;
    }

    /// Scalar divide operator (3-by-1)
    CUTLASS_HOST_DEVICE
    Matrix operator/(Element const& s) const { return divide(s); }

    /// Scalar divide operator (3-by-1)
    CUTLASS_HOST_DEVICE
    Matrix& operator/=(Element const& s) {
        data[0] /= s;

        data[1] /= s;

        data[2] /= s;

        return *this;
    }

    /// Elementwise divide operator (3-by-1)
    CUTLASS_HOST_DEVICE
    Matrix operator/(Matrix const& rhs) const { return divide(rhs); }

    /// Elementwise divide operator (3-by-1)
    CUTLASS_HOST_DEVICE
    Matrix& operator/=(Matrix const& rhs) {
        data[0] /= rhs.data[0];

        data[1] /= rhs.data[1];

        data[2] /= rhs.data[2];

        return *this;
    }

    /// Negates each element of the matrix
    CUTLASS_HOST_DEVICE
    Matrix operator-() const {
        Matrix m;

        m.data[0] = -m.data[0];
        m.data[1] = -m.data[1];
        m.data[2] = -m.data[2];

        return m;
    }

    /// Matrix product of size 3-by-1-by-1
    CUTLASS_HOST_DEVICE
    Matrix<Element, 3, 1> product(
            Matrix<Element, 1, 1> const& rhs,
            Matrix<Element, 3, 1> accum = Matrix<Element, 3, 1>()) const {
        // k=0
        accum.data[0] += data[0] * rhs.data[0];
        accum.data[1] += data[1] * rhs.data[0];
        accum.data[2] += data[2] * rhs.data[0];

        return accum;
    }

    /// Matrix product of size 3-by-1-by-1
    CUTLASS_HOST_DEVICE
    Matrix<Element, 3, 1> operator*(Matrix<Element, 1, 1> const& rhs) const {
        return product(rhs);
    }

    /// Matrix product of size 3-by-1-by-1
    CUTLASS_HOST_DEVICE
    Matrix& operator*=(Matrix<Element, 1, 1> const& rhs) {
        *this = product(rhs);
        return *this;
    }

    /// Matrix product of size 3-by-2-by-1
    CUTLASS_HOST_DEVICE
    Matrix<Element, 3, 2> product(
            Matrix<Element, 1, 2> const& rhs,
            Matrix<Element, 3, 2> accum = Matrix<Element, 3, 2>()) const {
        // k=0
        accum.data[0] += data[0] * rhs.data[0];
        accum.data[1] += data[0] * rhs.data[1];
        accum.data[2] += data[1] * rhs.data[0];
        accum.data[3] += data[1] * rhs.data[1];
        accum.data[4] += data[2] * rhs.data[0];
        accum.data[5] += data[2] * rhs.data[1];

        return accum;
    }

    /// Matrix product of size 3-by-2-by-1
    CUTLASS_HOST_DEVICE
    Matrix<Element, 3, 2> operator*(Matrix<Element, 1, 2> const& rhs) const {
        return product(rhs);
    }

    /// Matrix product of size 3-by-3-by-1
    CUTLASS_HOST_DEVICE
    Matrix<Element, 3, 3> product(
            Matrix<Element, 1, 3> const& rhs,
            Matrix<Element, 3, 3> accum = Matrix<Element, 3, 3>()) const {
        // k=0
        accum.data[0] += data[0] * rhs.data[0];
        accum.data[1] += data[0] * rhs.data[1];
        accum.data[2] += data[0] * rhs.data[2];
        accum.data[3] += data[1] * rhs.data[0];
        accum.data[4] += data[1] * rhs.data[1];
        accum.data[5] += data[1] * rhs.data[2];
        accum.data[6] += data[2] * rhs.data[0];
        accum.data[7] += data[2] * rhs.data[1];
        accum.data[8] += data[2] * rhs.data[2];

        return accum;
    }

    /// Matrix product of size 3-by-3-by-1
    CUTLASS_HOST_DEVICE
    Matrix<Element, 3, 3> operator*(Matrix<Element, 1, 3> const& rhs) const {
        return product(rhs);
    }

    /// Matrix product of size 3-by-4-by-1
    CUTLASS_HOST_DEVICE
    Matrix<Element, 3, 4> product(
            Matrix<Element, 1, 4> const& rhs,
            Matrix<Element, 3, 4> accum = Matrix<Element, 3, 4>()) const {
        // k=0
        accum.data[0] += data[0] * rhs.data[0];
        accum.data[1] += data[0] * rhs.data[1];
        accum.data[2] += data[0] * rhs.data[2];
        accum.data[3] += data[0] * rhs.data[3];
        accum.data[4] += data[1] * rhs.data[0];
        accum.data[5] += data[1] * rhs.data[1];
        accum.data[6] += data[1] * rhs.data[2];
        accum.data[7] += data[1] * rhs.data[3];
        accum.data[8] += data[2] * rhs.data[0];
        accum.data[9] += data[2] * rhs.data[1];
        accum.data[10] += data[2] * rhs.data[2];
        accum.data[11] += data[2] * rhs.data[3];

        return accum;
    }

    /// Matrix product of size 3-by-4-by-1
    CUTLASS_HOST_DEVICE
    Matrix<Element, 3, 4> operator*(Matrix<Element, 1, 4> const& rhs) const {
        return product(rhs);
    }

    /// Dot product of vectors with extent 3
    CUTLASS_HOST_DEVICE
    Element dot(Matrix<Element, 3, 1> const& rhs,
                Element accum = Element()) const {
        accum += data[0] * rhs.data[0];
        accum += data[1] * rhs.data[1];
        accum += data[2] * rhs.data[2];
        return accum;
    }

    /// Dot product of vectors with extent 3
    CUTLASS_HOST_DEVICE
    Element dot(Matrix<Element, 1, 3> const& rhs,
                Element accum = Element()) const {
        accum += data[0] * rhs.data[0];
        accum += data[1] * rhs.data[1];
        accum += data[2] * rhs.data[2];
        return accum;
    }

    /// Returns the sum of elements
    CUTLASS_HOST_DEVICE
    Element sum(Element accum = Element()) const {
        accum += data[0];
        accum += data[1];
        accum += data[2];

        return accum;
    }

    /// Returns the sum of squared elements
    CUTLASS_HOST_DEVICE
    Element norm(Element accum = Element()) const {
        accum += data[0] * data[0];
        accum += data[1] * data[1];
        accum += data[2] * data[2];

        return accum;
    }

    /// Returns square root of the norm
    CUTLASS_HOST_DEVICE
    Element magnitude() const { return fast_sqrt(norm()); }

    /// Returns the sum of diagonal elements
    CUTLASS_HOST_DEVICE
    Element trace(Element accum = Element()) const {
        accum += data[0];

        return accum;
    }

    /// Cross product
    CUTLASS_HOST_DEVICE
    Matrix cross(Matrix const& rhs) const {
        return Matrix(data[1] * rhs.data[2] - data[2] * rhs.data[1],
                      data[0] * rhs.data[2] - data[2] * rhs.data[1],
                      data[0] * rhs.data[1] - data[1] * rhs.data[0]);
    }
};

/// Template alias for 3-by-1 matrix
template <typename Element>
using Matrix3x1 = Matrix<Element, 3, 1>;

/// Free funciton to infer element type from template arguments
template <typename Element>
CUTLASS_HOST_DEVICE Matrix3x1<Element> make_Matrix3x1(Element _0_0,
                                                      Element _1_0,
                                                      Element _2_0) {
    return Matrix3x1<Element>(_0_0, _1_0, _2_0);
}

/////////////////////////////////////////////////////////////////////////////////////////////////

/// 3-by-2 matrix template class definition
template <typename Element_>
struct Matrix<Element_, 3, 2> {
    //
    // Type definitions
    //

    /// Element data type
    using Element = Element_;

    /// Number of rows in matrix
    static int const kRows = 3;

    /// Number of columns in matrix
    static int const kColumns = 2;

    /// Layout of matrix in underlying array
    using Layout = layout::RowMajor;

    /// Number of elements in matrix
    static int const kCount = 6;

    //
    // Data members
    //

    /// Elements of the matrix in row-major layout
    Array<Element, kCount> data;

    //
    // Methods
    //

    /// Constructs a zero matrix
    CUTLASS_HOST_DEVICE
    Matrix() { data.clear(); }

    /// Copy constructor for a 3-by-2 matrix
    CUTLASS_HOST_DEVICE
    Matrix(Matrix const& rhs) { data = rhs.data; }

    /// Constucts a 3-by-2 matrix from scalar elements
    CUTLASS_HOST_DEVICE
    Matrix(Element _0_0, Element _0_1, Element _1_0, Element _1_1, Element _2_0,
           Element _2_1) {
        data[0] = _0_0;
        data[1] = _0_1;
        data[2] = _1_0;
        data[3] = _1_1;
        data[4] = _2_0;
        data[5] = _2_1;
    }

    /// Constucts a 3-by-2 matrix from row vectors
    CUTLASS_HOST_DEVICE
    Matrix(Matrix<Element, 1, 2> const& row_0,
           Matrix<Element, 1, 2> const& row_1,
           Matrix<Element, 1, 2> const& row_2) {
        data[0] = row_0.data[0];
        data[1] = row_0.data[1];
        data[2] = row_1.data[0];
        data[3] = row_1.data[1];
        data[4] = row_2.data[0];
        data[5] = row_2.data[1];
    }

    /// Static method to construct a 3-by-2 matrix from column vectors
    CUTLASS_HOST_DEVICE
    static Matrix from_columns(Matrix<Element, 2, 1> const& column_0,
                               Matrix<Element, 2, 1> const& column_1) {
        Matrix result;

        result.data[0] = column_0.data[0];
        result.data[1] = column_1.data[0];
        result.data[2] = column_0.data[1];
        result.data[3] = column_1.data[1];
        result.data[4] = column_0.data[2];
        result.data[5] = column_1.data[2];
        return result;
    }

    /// Constructs a matrix from a uniform element
    CUTLASS_HOST_DEVICE
    static Matrix uniform(Element s) {
        Matrix m;

        m.data[0] = s;
        m.data[1] = s;
        m.data[2] = s;
        m.data[3] = s;
        m.data[4] = s;
        m.data[5] = s;

        return m;
    }

    /// Constructs a matrix from a uniform element 1
    CUTLASS_HOST_DEVICE
    static Matrix ones() { return uniform(Element(1)); }

    /// Constructs a matrix from a uniform element 0
    CUTLASS_HOST_DEVICE
    static Matrix zero() { return Matrix(); }

    /// Constructs a matrix from elements along its diagonal
    CUTLASS_HOST_DEVICE
    static Matrix from_diagonal(Matrix<Element, 2, 1> const& diag) {
        Matrix m;

        m.data[0] = diag.data[0];
        m.data[4] = diag.data[1];
        m.data[8] = diag.data[2];

        return m;
    }

    /// Constructs a matrix from elements along its diagonal
    CUTLASS_HOST_DEVICE
    static Matrix from_diagonal(Matrix<Element, 1, 2> const& diag) {
        Matrix m;

        m.data[0] = diag.data[0];
        m.data[4] = diag.data[1];
        m.data[8] = diag.data[2];

        return m;
    }

    /// Gets an array of diagonal elements
    CUTLASS_HOST_DEVICE
    Matrix<Element, 2, 1> diagonal() const {
        Matrix<Element, 2, 1> diag;

        diag.data[0] = data[0];
        diag.data[1] = data[4];
        diag.data[2] = data[8];

        return diag;
    }

    /// Returns a transposed matrix
    CUTLASS_HOST_DEVICE
    Matrix<Element, 2, 3> transpose() const {
        Matrix<Element, 2, 3> mt;

        mt.data[0] = data[0];
        mt.data[3] = data[1];
        mt.data[1] = data[2];
        mt.data[4] = data[3];
        mt.data[2] = data[4];
        mt.data[5] = data[5];

        return mt;
    }

    /// Accesses an element by coordinate
    CUTLASS_HOST_DEVICE
    Element at(int i, int j) const { return data[i * 3 + j]; }

    /// Accesses an element by coordinate
    CUTLASS_HOST_DEVICE
    Element& at(int i, int j) { return data[i * 3 + j]; }

    /// Accesses an element by coordinate
    CUTLASS_HOST_DEVICE
    Element at(Coord<2> const& coord) const { return at(coord[0], coord[1]); }

    /// Accesses an element by coordinate
    CUTLASS_HOST_DEVICE
    Element& at(Coord<2> const& coord) { return at(coord[0], coord[1]); }

    /// Accesses an element by offset
    CUTLASS_HOST_DEVICE
    Element& at(int offset) { return data[offset]; }

    /// Accesses an element by offset
    CUTLASS_HOST_DEVICE
    Element at(int offset) const { return data[offset]; }

    /// Accesses an element by coordinate
    CUTLASS_HOST_DEVICE
    Element operator[](Coord<2> const& coord) const {
        return at(coord[0], coord[1]);
    }

    /// Accesses an element by coordinate
    CUTLASS_HOST_DEVICE
    Element& operator[](Coord<2> const& coord) {
        return at(coord[0], coord[1]);
    }

    /// Accesses an element by offset
    CUTLASS_HOST_DEVICE
    Element& operator[](int offset) { return data[offset]; }

    /// Accesses an element by offset
    CUTLASS_HOST_DEVICE
    Element operator[](int offset) const { return data[offset]; }

    /// Gets a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix<Element, 1, 2> slice_1x2(int i = 0, int j = 0) const {
        Matrix<Element, 1, 2> m;

        m.data[0] = data[i * 2 + j + 0];
        m.data[1] = data[i * 2 + j + 1];

        return m;
    }

    /// Overwrites a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix& set_slice_1x2(Matrix<Element, 1, 2> const& m, int i = 0,
                          int j = 0) {
        data[i * 2 + j + 0] = m.data[0];
        data[i * 2 + j + 1] = m.data[1];

        return *this;
    }

    CUTLASS_HOST_DEVICE
    Matrix<Element, 1, 2> row(int i) const { return slice_1x2(i, 0); }

    Matrix& set_row(Matrix<Element, 1, 2> const& v, int i = 0) {
        return set_slice_1x2(v, i, 0);
    }

    /// Gets a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix<Element, 2, 1> slice_2x1(int i = 0, int j = 0) const {
        Matrix<Element, 2, 1> m;

        m.data[0] = data[i * 2 + j + 0];
        m.data[1] = data[i * 2 + j + 2];

        return m;
    }

    /// Overwrites a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix& set_slice_2x1(Matrix<Element, 2, 1> const& m, int i = 0,
                          int j = 0) {
        data[i * 2 + j + 0] = m.data[0];
        data[i * 2 + j + 2] = m.data[1];

        return *this;
    }

    /// Gets a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix<Element, 2, 2> slice_2x2(int i = 0, int j = 0) const {
        Matrix<Element, 2, 2> m;

        m.data[0] = data[i * 2 + j + 0];
        m.data[1] = data[i * 2 + j + 1];
        m.data[2] = data[i * 2 + j + 2];
        m.data[3] = data[i * 2 + j + 3];

        return m;
    }

    /// Overwrites a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix& set_slice_2x2(Matrix<Element, 2, 2> const& m, int i = 0,
                          int j = 0) {
        data[i * 2 + j + 0] = m.data[0];
        data[i * 2 + j + 1] = m.data[1];
        data[i * 2 + j + 2] = m.data[2];
        data[i * 2 + j + 3] = m.data[3];

        return *this;
    }

    /// Gets a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix<Element, 3, 1> slice_3x1(int i = 0, int j = 0) const {
        Matrix<Element, 3, 1> m;

        m.data[0] = data[i * 2 + j + 0];
        m.data[1] = data[i * 2 + j + 2];
        m.data[2] = data[i * 2 + j + 4];

        return m;
    }

    /// Overwrites a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix& set_slice_3x1(Matrix<Element, 3, 1> const& m, int i = 0,
                          int j = 0) {
        data[i * 2 + j + 0] = m.data[0];
        data[i * 2 + j + 2] = m.data[1];
        data[i * 2 + j + 4] = m.data[2];

        return *this;
    }

    CUTLASS_HOST_DEVICE
    Matrix<Element, 3, 1> column(int j) const { return slice_3x1(0, j); }

    Matrix& set_column(Matrix<Element, 3, 1> const& v, int j = 0) {
        return set_slice_3x1(v, 0, j);
    }

    /// Gets a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix<Element, 3, 2> slice_3x2(int i = 0, int j = 0) const {
        Matrix<Element, 3, 2> m;

        m.data[0] = data[i * 2 + j + 0];
        m.data[1] = data[i * 2 + j + 1];
        m.data[2] = data[i * 2 + j + 2];
        m.data[3] = data[i * 2 + j + 3];
        m.data[4] = data[i * 2 + j + 4];
        m.data[5] = data[i * 2 + j + 5];

        return m;
    }

    /// Overwrites a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix& set_slice_3x2(Matrix<Element, 3, 2> const& m, int i = 0,
                          int j = 0) {
        data[i * 2 + j + 0] = m.data[0];
        data[i * 2 + j + 1] = m.data[1];
        data[i * 2 + j + 2] = m.data[2];
        data[i * 2 + j + 3] = m.data[3];
        data[i * 2 + j + 4] = m.data[4];
        data[i * 2 + j + 5] = m.data[5];

        return *this;
    }

    /// Forms a 3-by-2 matrix by horizontally concatenating a 3-by-1 matrix with
    /// a 3-by-1 matrix
    CUTLASS_HOST_DEVICE
    static Matrix hcat(Matrix<Element, 3, 1> const& lhs,
                       Matrix<Element, 3, 1> const& rhs) {
        return Matrix(lhs.at(0, 0), rhs.at(0, 0), lhs.at(1, 0), rhs.at(1, 0),
                      lhs.at(2, 0), rhs.at(2, 0));
    }

    /// Concatenates this matrix with a a 3-by-1 matrix to form a 3-by-3 matrix
    CUTLASS_HOST_DEVICE
    Matrix<Element, 3, 3> hcat(Matrix<Element, 3, 1> const& rhs) const {
        return Matrix<Element, 3, 3>::hcat(*this, rhs);
    }

    /// Concatenates this matrix with a a 3-by-2 matrix to form a 3-by-4 matrix
    CUTLASS_HOST_DEVICE
    Matrix<Element, 3, 4> hcat(Matrix<Element, 3, 2> const& rhs) const {
        return Matrix<Element, 3, 4>::hcat(*this, rhs);
    }

    /// Forms a 3-by-2 matrix by vertically concatenating a 1-by-2 matrix with a
    /// 2-by-2 matrix
    CUTLASS_HOST_DEVICE
    static Matrix vcat(Matrix<Element, 1, 2> const& upper,
                       Matrix<Element, 2, 2> const& lower) {
        return Matrix(upper.at(0, 0), upper.at(0, 1), lower.at(0, 0),
                      lower.at(0, 1), lower.at(1, 0), lower.at(1, 1));
    }

    /// Forms a 3-by-2 matrix by vertically concatenating a 2-by-2 matrix with a
    /// 1-by-2 matrix
    CUTLASS_HOST_DEVICE
    static Matrix vcat(Matrix<Element, 2, 2> const& upper,
                       Matrix<Element, 1, 2> const& lower) {
        return Matrix(upper.at(0, 0), upper.at(0, 1), upper.at(1, 0),
                      upper.at(1, 1), lower.at(0, 0), lower.at(0, 1));
    }

    /// Concatenates this matrix with a a 1-by-2 matrix to form a 4-by-2 matrix
    CUTLASS_HOST_DEVICE
    Matrix<Element, 4, 2> vcat(Matrix<Element, 1, 2> const& rhs) const {
        return Matrix<Element, 4, 2>::vcat(*this, rhs);
    }

    /// Forms a 3-by-2 matrix by concatenating four components
    CUTLASS_HOST_DEVICE
    static Matrix block(Element A, Element B, Matrix<Element, 2, 1> const& C,
                        Matrix<Element, 2, 1> const& D) {
        return Matrix(A, B, C.at(0, 0), D.at(0, 0), C.at(1, 0), D.at(1, 0));
    }

    /// Forms a 3-by-2 matrix by concatenating four components
    CUTLASS_HOST_DEVICE
    static Matrix block(Matrix<Element, 2, 1> const& A,
                        Matrix<Element, 2, 1> const& B, Element C, Element D) {
        return Matrix(A.at(0, 0), B.at(0, 0), A.at(1, 0), B.at(1, 0), C, D);
    }

    /// Elementwise add operator (3-by-2)
    CUTLASS_HOST_DEVICE
    Matrix add(Matrix const& rhs) const {
        Matrix result;

        result.data[0] = data[0] + rhs.data[0];
        result.data[1] = data[1] + rhs.data[1];

        result.data[2] = data[2] + rhs.data[2];
        result.data[3] = data[3] + rhs.data[3];

        result.data[4] = data[4] + rhs.data[4];
        result.data[5] = data[5] + rhs.data[5];

        return result;
    }

    /// Elementwise add operator (3-by-2)
    CUTLASS_HOST_DEVICE
    Matrix operator+(Matrix const& rhs) const { return add(rhs); }

    /// Elementwise add operator (3-by-2)
    CUTLASS_HOST_DEVICE
    Matrix& operator+=(Matrix const& rhs) {
        data[0] += rhs.data[0];
        data[1] += rhs.data[1];

        data[2] += rhs.data[2];
        data[3] += rhs.data[3];

        data[4] += rhs.data[4];
        data[5] += rhs.data[5];

        return *this;
    }

    /// Elementwise subtract operator (3-by-2)
    CUTLASS_HOST_DEVICE
    Matrix subtract(Matrix const& rhs) const {
        Matrix result;

        result.data[0] = data[0] - rhs.data[0];
        result.data[1] = data[1] - rhs.data[1];

        result.data[2] = data[2] - rhs.data[2];
        result.data[3] = data[3] - rhs.data[3];

        result.data[4] = data[4] - rhs.data[4];
        result.data[5] = data[5] - rhs.data[5];

        return result;
    }

    /// Elementwise subtract operator (3-by-2)
    CUTLASS_HOST_DEVICE
    Matrix operator-(Matrix const& rhs) const { return subtract(rhs); }

    /// Elementwise subtract operator (3-by-2)
    CUTLASS_HOST_DEVICE
    Matrix& operator-=(Matrix const& rhs) {
        data[0] -= rhs.data[0];
        data[1] -= rhs.data[1];

        data[2] -= rhs.data[2];
        data[3] -= rhs.data[3];

        data[4] -= rhs.data[4];
        data[5] -= rhs.data[5];

        return *this;
    }

    /// Elementwise multiply operator (3-by-2)
    CUTLASS_HOST_DEVICE
    Matrix multiply(Matrix const& rhs) const {
        Matrix result;

        result.data[0] = data[0] * rhs.data[0];
        result.data[1] = data[1] * rhs.data[1];

        result.data[2] = data[2] * rhs.data[2];
        result.data[3] = data[3] * rhs.data[3];

        result.data[4] = data[4] * rhs.data[4];
        result.data[5] = data[5] * rhs.data[5];

        return result;
    }

    /// Scalar multiply operator (3-by-2)
    CUTLASS_HOST_DEVICE
    Matrix multiply(Element const& s) const {
        Matrix result;

        result.data[0] = data[0] * s;
        result.data[1] = data[1] * s;

        result.data[2] = data[2] * s;
        result.data[3] = data[3] * s;

        result.data[4] = data[4] * s;
        result.data[5] = data[5] * s;

        return result;
    }

    /// Scalar multiply operator (3-by-2)
    CUTLASS_HOST_DEVICE
    Matrix operator*(Element const& s) const { return multiply(s); }

    /// Scalar multiply operator (3-by-2)
    CUTLASS_HOST_DEVICE
    Matrix& operator*=(Element const& s) {
        data[0] *= s;
        data[1] *= s;

        data[2] *= s;
        data[3] *= s;

        data[4] *= s;
        data[5] *= s;

        return *this;
    }

    /// Elementwise divide operator (3-by-2)
    CUTLASS_HOST_DEVICE
    Matrix divide(Matrix const& rhs) const {
        Matrix result;

        result.data[0] = data[0] / rhs.data[0];
        result.data[1] = data[1] / rhs.data[1];

        result.data[2] = data[2] / rhs.data[2];
        result.data[3] = data[3] / rhs.data[3];

        result.data[4] = data[4] / rhs.data[4];
        result.data[5] = data[5] / rhs.data[5];

        return result;
    }

    /// Scalar divide operator (3-by-2)
    CUTLASS_HOST_DEVICE
    Matrix divide(Element const& s) const {
        Matrix result;

        result.data[0] = data[0] / s;
        result.data[1] = data[1] / s;

        result.data[2] = data[2] / s;
        result.data[3] = data[3] / s;

        result.data[4] = data[4] / s;
        result.data[5] = data[5] / s;

        return result;
    }

    /// Scalar divide operator (3-by-2)
    CUTLASS_HOST_DEVICE
    Matrix operator/(Element const& s) const { return divide(s); }

    /// Scalar divide operator (3-by-2)
    CUTLASS_HOST_DEVICE
    Matrix& operator/=(Element const& s) {
        data[0] /= s;
        data[1] /= s;

        data[2] /= s;
        data[3] /= s;

        data[4] /= s;
        data[5] /= s;

        return *this;
    }

    /// Elementwise divide operator (3-by-2)
    CUTLASS_HOST_DEVICE
    Matrix operator/(Matrix const& rhs) const { return divide(rhs); }

    /// Elementwise divide operator (3-by-2)
    CUTLASS_HOST_DEVICE
    Matrix& operator/=(Matrix const& rhs) {
        data[0] /= rhs.data[0];
        data[1] /= rhs.data[1];

        data[2] /= rhs.data[2];
        data[3] /= rhs.data[3];

        data[4] /= rhs.data[4];
        data[5] /= rhs.data[5];

        return *this;
    }

    /// Negates each element of the matrix
    CUTLASS_HOST_DEVICE
    Matrix operator-() const {
        Matrix m;

        m.data[0] = -m.data[0];
        m.data[1] = -m.data[1];
        m.data[2] = -m.data[2];
        m.data[3] = -m.data[3];
        m.data[4] = -m.data[4];
        m.data[5] = -m.data[5];

        return m;
    }

    /// Matrix product of size 3-by-1-by-2
    CUTLASS_HOST_DEVICE
    Matrix<Element, 3, 1> product(
            Matrix<Element, 2, 1> const& rhs,
            Matrix<Element, 3, 1> accum = Matrix<Element, 3, 1>()) const {
        // k=0
        accum.data[0] += data[0] * rhs.data[0];
        accum.data[1] += data[2] * rhs.data[0];
        accum.data[2] += data[4] * rhs.data[0];

        // k=1
        accum.data[0] += data[1] * rhs.data[1];
        accum.data[1] += data[3] * rhs.data[1];
        accum.data[2] += data[5] * rhs.data[1];

        return accum;
    }

    /// Matrix product of size 3-by-1-by-2
    CUTLASS_HOST_DEVICE
    Matrix<Element, 3, 1> operator*(Matrix<Element, 2, 1> const& rhs) const {
        return product(rhs);
    }

    /// Matrix product of size 3-by-2-by-2
    CUTLASS_HOST_DEVICE
    Matrix<Element, 3, 2> product(
            Matrix<Element, 2, 2> const& rhs,
            Matrix<Element, 3, 2> accum = Matrix<Element, 3, 2>()) const {
        // k=0
        accum.data[0] += data[0] * rhs.data[0];
        accum.data[1] += data[0] * rhs.data[1];
        accum.data[2] += data[2] * rhs.data[0];
        accum.data[3] += data[2] * rhs.data[1];
        accum.data[4] += data[4] * rhs.data[0];
        accum.data[5] += data[4] * rhs.data[1];

        // k=1
        accum.data[0] += data[1] * rhs.data[2];
        accum.data[1] += data[1] * rhs.data[3];
        accum.data[2] += data[3] * rhs.data[2];
        accum.data[3] += data[3] * rhs.data[3];
        accum.data[4] += data[5] * rhs.data[2];
        accum.data[5] += data[5] * rhs.data[3];

        return accum;
    }

    /// Matrix product of size 3-by-2-by-2
    CUTLASS_HOST_DEVICE
    Matrix<Element, 3, 2> operator*(Matrix<Element, 2, 2> const& rhs) const {
        return product(rhs);
    }

    /// Matrix product of size 3-by-2-by-2
    CUTLASS_HOST_DEVICE
    Matrix& operator*=(Matrix<Element, 2, 2> const& rhs) {
        *this = product(rhs);
        return *this;
    }

    /// Matrix product of size 3-by-3-by-2
    CUTLASS_HOST_DEVICE
    Matrix<Element, 3, 3> product(
            Matrix<Element, 2, 3> const& rhs,
            Matrix<Element, 3, 3> accum = Matrix<Element, 3, 3>()) const {
        // k=0
        accum.data[0] += data[0] * rhs.data[0];
        accum.data[1] += data[0] * rhs.data[1];
        accum.data[2] += data[0] * rhs.data[2];
        accum.data[3] += data[2] * rhs.data[0];
        accum.data[4] += data[2] * rhs.data[1];
        accum.data[5] += data[2] * rhs.data[2];
        accum.data[6] += data[4] * rhs.data[0];
        accum.data[7] += data[4] * rhs.data[1];
        accum.data[8] += data[4] * rhs.data[2];

        // k=1
        accum.data[0] += data[1] * rhs.data[3];
        accum.data[1] += data[1] * rhs.data[4];
        accum.data[2] += data[1] * rhs.data[5];
        accum.data[3] += data[3] * rhs.data[3];
        accum.data[4] += data[3] * rhs.data[4];
        accum.data[5] += data[3] * rhs.data[5];
        accum.data[6] += data[5] * rhs.data[3];
        accum.data[7] += data[5] * rhs.data[4];
        accum.data[8] += data[5] * rhs.data[5];

        return accum;
    }

    /// Matrix product of size 3-by-3-by-2
    CUTLASS_HOST_DEVICE
    Matrix<Element, 3, 3> operator*(Matrix<Element, 2, 3> const& rhs) const {
        return product(rhs);
    }

    /// Matrix product of size 3-by-4-by-2
    CUTLASS_HOST_DEVICE
    Matrix<Element, 3, 4> product(
            Matrix<Element, 2, 4> const& rhs,
            Matrix<Element, 3, 4> accum = Matrix<Element, 3, 4>()) const {
        // k=0
        accum.data[0] += data[0] * rhs.data[0];
        accum.data[1] += data[0] * rhs.data[1];
        accum.data[2] += data[0] * rhs.data[2];
        accum.data[3] += data[0] * rhs.data[3];
        accum.data[4] += data[2] * rhs.data[0];
        accum.data[5] += data[2] * rhs.data[1];
        accum.data[6] += data[2] * rhs.data[2];
        accum.data[7] += data[2] * rhs.data[3];
        accum.data[8] += data[4] * rhs.data[0];
        accum.data[9] += data[4] * rhs.data[1];
        accum.data[10] += data[4] * rhs.data[2];
        accum.data[11] += data[4] * rhs.data[3];

        // k=1
        accum.data[0] += data[1] * rhs.data[4];
        accum.data[1] += data[1] * rhs.data[5];
        accum.data[2] += data[1] * rhs.data[6];
        accum.data[3] += data[1] * rhs.data[7];
        accum.data[4] += data[3] * rhs.data[4];
        accum.data[5] += data[3] * rhs.data[5];
        accum.data[6] += data[3] * rhs.data[6];
        accum.data[7] += data[3] * rhs.data[7];
        accum.data[8] += data[5] * rhs.data[4];
        accum.data[9] += data[5] * rhs.data[5];
        accum.data[10] += data[5] * rhs.data[6];
        accum.data[11] += data[5] * rhs.data[7];

        return accum;
    }

    /// Matrix product of size 3-by-4-by-2
    CUTLASS_HOST_DEVICE
    Matrix<Element, 3, 4> operator*(Matrix<Element, 2, 4> const& rhs) const {
        return product(rhs);
    }

    /// Returns the sum of elements
    CUTLASS_HOST_DEVICE
    Element sum(Element accum = Element()) const {
        accum += data[0];
        accum += data[1];
        accum += data[2];
        accum += data[3];
        accum += data[4];
        accum += data[5];

        return accum;
    }

    /// Returns the sum of squared elements
    CUTLASS_HOST_DEVICE
    Element norm(Element accum = Element()) const {
        accum += data[0] * data[0];
        accum += data[1] * data[1];
        accum += data[2] * data[2];
        accum += data[3] * data[3];
        accum += data[4] * data[4];
        accum += data[5] * data[5];

        return accum;
    }

    /// Returns square root of the norm
    CUTLASS_HOST_DEVICE
    Element magnitude() const { return fast_sqrt(norm()); }

    /// Returns the sum of diagonal elements
    CUTLASS_HOST_DEVICE
    Element trace(Element accum = Element()) const {
        accum += data[0];
        accum += data[3];

        return accum;
    }
};

/// Template alias for 3-by-2 matrix
template <typename Element>
using Matrix3x2 = Matrix<Element, 3, 2>;

/// Free funciton to infer element type from template arguments
template <typename Element>
CUTLASS_HOST_DEVICE Matrix3x2<Element> make_Matrix3x2(
        Element _0_0, Element _0_1, Element _1_0, Element _1_1, Element _2_0,
        Element _2_1) {
    return Matrix3x2<Element>(_0_0, _0_1, _1_0, _1_1, _2_0, _2_1);
}

/////////////////////////////////////////////////////////////////////////////////////////////////

/// 3-by-3 matrix template class definition
template <typename Element_>
struct Matrix<Element_, 3, 3> {
    //
    // Type definitions
    //

    /// Element data type
    using Element = Element_;

    /// Number of rows in matrix
    static int const kRows = 3;

    /// Number of columns in matrix
    static int const kColumns = 3;

    /// Layout of matrix in underlying array
    using Layout = layout::RowMajor;

    /// Number of elements in matrix
    static int const kCount = 9;

    //
    // Data members
    //

    /// Elements of the matrix in row-major layout
    Array<Element, kCount> data;

    //
    // Methods
    //

    /// Constructs a zero matrix
    CUTLASS_HOST_DEVICE
    Matrix() { data.clear(); }

    /// Copy constructor for a 3-by-3 matrix
    CUTLASS_HOST_DEVICE
    Matrix(Matrix const& rhs) { data = rhs.data; }

    /// Constucts a 3-by-3 matrix from scalar elements
    CUTLASS_HOST_DEVICE
    Matrix(Element _0_0, Element _0_1, Element _0_2, Element _1_0, Element _1_1,
           Element _1_2, Element _2_0, Element _2_1, Element _2_2) {
        data[0] = _0_0;
        data[1] = _0_1;
        data[2] = _0_2;
        data[3] = _1_0;
        data[4] = _1_1;
        data[5] = _1_2;
        data[6] = _2_0;
        data[7] = _2_1;
        data[8] = _2_2;
    }

    /// Constucts a 3-by-3 matrix from row vectors
    CUTLASS_HOST_DEVICE
    Matrix(Matrix<Element, 1, 3> const& row_0,
           Matrix<Element, 1, 3> const& row_1,
           Matrix<Element, 1, 3> const& row_2) {
        data[0] = row_0.data[0];
        data[1] = row_0.data[1];
        data[2] = row_0.data[2];
        data[3] = row_1.data[0];
        data[4] = row_1.data[1];
        data[5] = row_1.data[2];
        data[6] = row_2.data[0];
        data[7] = row_2.data[1];
        data[8] = row_2.data[2];
    }

    /// Static method to construct a 3-by-3 matrix from column vectors
    CUTLASS_HOST_DEVICE
    static Matrix from_columns(Matrix<Element, 3, 1> const& column_0,
                               Matrix<Element, 3, 1> const& column_1,
                               Matrix<Element, 3, 1> const& column_2) {
        Matrix result;

        result.data[0] = column_0.data[0];
        result.data[1] = column_1.data[0];
        result.data[2] = column_2.data[0];
        result.data[3] = column_0.data[1];
        result.data[4] = column_1.data[1];
        result.data[5] = column_2.data[1];
        result.data[6] = column_0.data[2];
        result.data[7] = column_1.data[2];
        result.data[8] = column_2.data[2];
        return result;
    }

    /// Constructs an identity matrix
    CUTLASS_HOST_DEVICE
    static Matrix identity() {
        Matrix m;

        m.data[0] = Element(1);
        m.data[4] = Element(1);
        m.data[8] = Element(1);

        return m;
    }

    /// Constructs a matrix from a uniform element
    CUTLASS_HOST_DEVICE
    static Matrix uniform(Element s) {
        Matrix m;

        m.data[0] = s;
        m.data[1] = s;
        m.data[2] = s;
        m.data[3] = s;
        m.data[4] = s;
        m.data[5] = s;
        m.data[6] = s;
        m.data[7] = s;
        m.data[8] = s;

        return m;
    }

    /// Constructs a matrix from a uniform element 1
    CUTLASS_HOST_DEVICE
    static Matrix ones() { return uniform(Element(1)); }

    /// Constructs a matrix from a uniform element 0
    CUTLASS_HOST_DEVICE
    static Matrix zero() { return Matrix(); }

    /// Constructs a matrix from elements along its diagonal
    CUTLASS_HOST_DEVICE
    static Matrix from_diagonal(Matrix<Element, 3, 1> const& diag) {
        Matrix m;

        m.data[0] = diag.data[0];
        m.data[4] = diag.data[1];
        m.data[8] = diag.data[2];

        return m;
    }

    /// Constructs a matrix from elements along its diagonal
    CUTLASS_HOST_DEVICE
    static Matrix from_diagonal(Matrix<Element, 1, 3> const& diag) {
        Matrix m;

        m.data[0] = diag.data[0];
        m.data[4] = diag.data[1];
        m.data[8] = diag.data[2];

        return m;
    }

    /// Gets an array of diagonal elements
    CUTLASS_HOST_DEVICE
    Matrix<Element, 3, 1> diagonal() const {
        Matrix<Element, 3, 1> diag;

        diag.data[0] = data[0];
        diag.data[1] = data[4];
        diag.data[2] = data[8];

        return diag;
    }

    /// Returns a transposed matrix
    CUTLASS_HOST_DEVICE
    Matrix<Element, 3, 3> transpose() const {
        Matrix<Element, 3, 3> mt;

        mt.data[0] = data[0];
        mt.data[3] = data[1];
        mt.data[6] = data[2];
        mt.data[1] = data[3];
        mt.data[4] = data[4];
        mt.data[7] = data[5];
        mt.data[2] = data[6];
        mt.data[5] = data[7];
        mt.data[8] = data[8];

        return mt;
    }

    /// Accesses an element by coordinate
    CUTLASS_HOST_DEVICE
    Element at(int i, int j) const { return data[i * 3 + j]; }

    /// Accesses an element by coordinate
    CUTLASS_HOST_DEVICE
    Element& at(int i, int j) { return data[i * 3 + j]; }

    /// Accesses an element by coordinate
    CUTLASS_HOST_DEVICE
    Element at(Coord<2> const& coord) const { return at(coord[0], coord[1]); }

    /// Accesses an element by coordinate
    CUTLASS_HOST_DEVICE
    Element& at(Coord<2> const& coord) { return at(coord[0], coord[1]); }

    /// Accesses an element by offset
    CUTLASS_HOST_DEVICE
    Element& at(int offset) { return data[offset]; }

    /// Accesses an element by offset
    CUTLASS_HOST_DEVICE
    Element at(int offset) const { return data[offset]; }

    /// Accesses an element by coordinate
    CUTLASS_HOST_DEVICE
    Element operator[](Coord<2> const& coord) const {
        return at(coord[0], coord[1]);
    }

    /// Accesses an element by coordinate
    CUTLASS_HOST_DEVICE
    Element& operator[](Coord<2> const& coord) {
        return at(coord[0], coord[1]);
    }

    /// Accesses an element by offset
    CUTLASS_HOST_DEVICE
    Element& operator[](int offset) { return data[offset]; }

    /// Accesses an element by offset
    CUTLASS_HOST_DEVICE
    Element operator[](int offset) const { return data[offset]; }

    /// Gets a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix<Element, 1, 2> slice_1x2(int i = 0, int j = 0) const {
        Matrix<Element, 1, 2> m;

        m.data[0] = data[i * 3 + j + 0];
        m.data[1] = data[i * 3 + j + 1];

        return m;
    }

    /// Overwrites a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix& set_slice_1x2(Matrix<Element, 1, 2> const& m, int i = 0,
                          int j = 0) {
        data[i * 3 + j + 0] = m.data[0];
        data[i * 3 + j + 1] = m.data[1];

        return *this;
    }

    /// Gets a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix<Element, 1, 3> slice_1x3(int i = 0, int j = 0) const {
        Matrix<Element, 1, 3> m;

        m.data[0] = data[i * 3 + j + 0];
        m.data[1] = data[i * 3 + j + 1];
        m.data[2] = data[i * 3 + j + 2];

        return m;
    }

    /// Overwrites a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix& set_slice_1x3(Matrix<Element, 1, 3> const& m, int i = 0,
                          int j = 0) {
        data[i * 3 + j + 0] = m.data[0];
        data[i * 3 + j + 1] = m.data[1];
        data[i * 3 + j + 2] = m.data[2];

        return *this;
    }

    CUTLASS_HOST_DEVICE
    Matrix<Element, 1, 3> row(int i) const { return slice_1x3(i, 0); }

    Matrix& set_row(Matrix<Element, 1, 3> const& v, int i = 0) {
        return set_slice_1x3(v, i, 0);
    }

    /// Gets a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix<Element, 2, 1> slice_2x1(int i = 0, int j = 0) const {
        Matrix<Element, 2, 1> m;

        m.data[0] = data[i * 3 + j + 0];
        m.data[1] = data[i * 3 + j + 3];

        return m;
    }

    /// Overwrites a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix& set_slice_2x1(Matrix<Element, 2, 1> const& m, int i = 0,
                          int j = 0) {
        data[i * 3 + j + 0] = m.data[0];
        data[i * 3 + j + 3] = m.data[1];

        return *this;
    }

    /// Gets a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix<Element, 2, 2> slice_2x2(int i = 0, int j = 0) const {
        Matrix<Element, 2, 2> m;

        m.data[0] = data[i * 3 + j + 0];
        m.data[1] = data[i * 3 + j + 1];
        m.data[2] = data[i * 3 + j + 3];
        m.data[3] = data[i * 3 + j + 4];

        return m;
    }

    /// Overwrites a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix& set_slice_2x2(Matrix<Element, 2, 2> const& m, int i = 0,
                          int j = 0) {
        data[i * 3 + j + 0] = m.data[0];
        data[i * 3 + j + 1] = m.data[1];
        data[i * 3 + j + 3] = m.data[2];
        data[i * 3 + j + 4] = m.data[3];

        return *this;
    }

    /// Gets a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix<Element, 2, 3> slice_2x3(int i = 0, int j = 0) const {
        Matrix<Element, 2, 3> m;

        m.data[0] = data[i * 3 + j + 0];
        m.data[1] = data[i * 3 + j + 1];
        m.data[2] = data[i * 3 + j + 2];
        m.data[3] = data[i * 3 + j + 3];
        m.data[4] = data[i * 3 + j + 4];
        m.data[5] = data[i * 3 + j + 5];

        return m;
    }

    /// Overwrites a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix& set_slice_2x3(Matrix<Element, 2, 3> const& m, int i = 0,
                          int j = 0) {
        data[i * 3 + j + 0] = m.data[0];
        data[i * 3 + j + 1] = m.data[1];
        data[i * 3 + j + 2] = m.data[2];
        data[i * 3 + j + 3] = m.data[3];
        data[i * 3 + j + 4] = m.data[4];
        data[i * 3 + j + 5] = m.data[5];

        return *this;
    }

    /// Gets a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix<Element, 3, 1> slice_3x1(int i = 0, int j = 0) const {
        Matrix<Element, 3, 1> m;

        m.data[0] = data[i * 3 + j + 0];
        m.data[1] = data[i * 3 + j + 3];
        m.data[2] = data[i * 3 + j + 6];

        return m;
    }

    /// Overwrites a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix& set_slice_3x1(Matrix<Element, 3, 1> const& m, int i = 0,
                          int j = 0) {
        data[i * 3 + j + 0] = m.data[0];
        data[i * 3 + j + 3] = m.data[1];
        data[i * 3 + j + 6] = m.data[2];

        return *this;
    }

    CUTLASS_HOST_DEVICE
    Matrix<Element, 3, 1> column(int j) const { return slice_3x1(0, j); }

    Matrix& set_column(Matrix<Element, 3, 1> const& v, int j = 0) {
        return set_slice_3x1(v, 0, j);
    }

    /// Gets a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix<Element, 3, 2> slice_3x2(int i = 0, int j = 0) const {
        Matrix<Element, 3, 2> m;

        m.data[0] = data[i * 3 + j + 0];
        m.data[1] = data[i * 3 + j + 1];
        m.data[2] = data[i * 3 + j + 3];
        m.data[3] = data[i * 3 + j + 4];
        m.data[4] = data[i * 3 + j + 6];
        m.data[5] = data[i * 3 + j + 7];

        return m;
    }

    /// Overwrites a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix& set_slice_3x2(Matrix<Element, 3, 2> const& m, int i = 0,
                          int j = 0) {
        data[i * 3 + j + 0] = m.data[0];
        data[i * 3 + j + 1] = m.data[1];
        data[i * 3 + j + 3] = m.data[2];
        data[i * 3 + j + 4] = m.data[3];
        data[i * 3 + j + 6] = m.data[4];
        data[i * 3 + j + 7] = m.data[5];

        return *this;
    }

    /// Gets a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix<Element, 3, 3> slice_3x3(int i = 0, int j = 0) const {
        Matrix<Element, 3, 3> m;

        m.data[0] = data[i * 3 + j + 0];
        m.data[1] = data[i * 3 + j + 1];
        m.data[2] = data[i * 3 + j + 2];
        m.data[3] = data[i * 3 + j + 3];
        m.data[4] = data[i * 3 + j + 4];
        m.data[5] = data[i * 3 + j + 5];
        m.data[6] = data[i * 3 + j + 6];
        m.data[7] = data[i * 3 + j + 7];
        m.data[8] = data[i * 3 + j + 8];

        return m;
    }

    /// Overwrites a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix& set_slice_3x3(Matrix<Element, 3, 3> const& m, int i = 0,
                          int j = 0) {
        data[i * 3 + j + 0] = m.data[0];
        data[i * 3 + j + 1] = m.data[1];
        data[i * 3 + j + 2] = m.data[2];
        data[i * 3 + j + 3] = m.data[3];
        data[i * 3 + j + 4] = m.data[4];
        data[i * 3 + j + 5] = m.data[5];
        data[i * 3 + j + 6] = m.data[6];
        data[i * 3 + j + 7] = m.data[7];
        data[i * 3 + j + 8] = m.data[8];

        return *this;
    }

    /// Forms a 3-by-3 matrix by horizontally concatenating a 3-by-1 matrix with
    /// a 3-by-2 matrix
    CUTLASS_HOST_DEVICE
    static Matrix hcat(Matrix<Element, 3, 1> const& lhs,
                       Matrix<Element, 3, 2> const& rhs) {
        return Matrix(lhs.at(0, 0), rhs.at(0, 0), rhs.at(0, 1), lhs.at(1, 0),
                      rhs.at(1, 0), rhs.at(1, 1), lhs.at(2, 0), rhs.at(2, 0),
                      rhs.at(2, 1));
    }

    /// Forms a 3-by-3 matrix by horizontally concatenating a 3-by-2 matrix with
    /// a 3-by-1 matrix
    CUTLASS_HOST_DEVICE
    static Matrix hcat(Matrix<Element, 3, 2> const& lhs,
                       Matrix<Element, 3, 1> const& rhs) {
        return Matrix(lhs.at(0, 0), lhs.at(0, 1), rhs.at(0, 0), lhs.at(1, 0),
                      lhs.at(1, 1), rhs.at(1, 0), lhs.at(2, 0), lhs.at(2, 1),
                      rhs.at(2, 0));
    }

    /// Concatenates this matrix with a a 3-by-1 matrix to form a 3-by-4 matrix
    CUTLASS_HOST_DEVICE
    Matrix<Element, 3, 4> hcat(Matrix<Element, 3, 1> const& rhs) const {
        return Matrix<Element, 3, 4>::hcat(*this, rhs);
    }

    /// Forms a 3-by-3 matrix by vertically concatenating a 1-by-3 matrix with a
    /// 2-by-3 matrix
    CUTLASS_HOST_DEVICE
    static Matrix vcat(Matrix<Element, 1, 3> const& upper,
                       Matrix<Element, 2, 3> const& lower) {
        return Matrix(upper.at(0, 0), upper.at(0, 1), upper.at(0, 2),
                      lower.at(0, 0), lower.at(0, 1), lower.at(0, 2),
                      lower.at(1, 0), lower.at(1, 1), lower.at(1, 2));
    }

    /// Forms a 3-by-3 matrix by vertically concatenating a 2-by-3 matrix with a
    /// 1-by-3 matrix
    CUTLASS_HOST_DEVICE
    static Matrix vcat(Matrix<Element, 2, 3> const& upper,
                       Matrix<Element, 1, 3> const& lower) {
        return Matrix(upper.at(0, 0), upper.at(0, 1), upper.at(0, 2),
                      upper.at(1, 0), upper.at(1, 1), upper.at(1, 2),
                      lower.at(0, 0), lower.at(0, 1), lower.at(0, 2));
    }

    /// Concatenates this matrix with a a 1-by-3 matrix to form a 4-by-3 matrix
    CUTLASS_HOST_DEVICE
    Matrix<Element, 4, 3> vcat(Matrix<Element, 1, 3> const& rhs) const {
        return Matrix<Element, 4, 3>::vcat(*this, rhs);
    }

    /// Forms a 3-by-3 matrix by concatenating four components
    CUTLASS_HOST_DEVICE
    static Matrix block(Element A, Matrix<Element, 1, 2> const& B,
                        Matrix<Element, 2, 1> const& C,
                        Matrix<Element, 2, 2> const& D) {
        return Matrix(A, B.at(0, 0), B.at(0, 1), C.at(0, 0), D.at(0, 0),
                      D.at(0, 1), C.at(1, 0), D.at(1, 0), D.at(1, 1));
    }

    /// Forms a 3-by-3 matrix by concatenating four components
    CUTLASS_HOST_DEVICE
    static Matrix block(Matrix<Element, 1, 2> const& A, Element B,
                        Matrix<Element, 2, 2> const& C,
                        Matrix<Element, 2, 1> const& D) {
        return Matrix(A.at(0, 0), A.at(0, 1), B, C.at(0, 0), C.at(0, 1),
                      D.at(0, 0), C.at(1, 0), C.at(1, 1), D.at(1, 0));
    }

    /// Forms a 3-by-3 matrix by concatenating four components
    CUTLASS_HOST_DEVICE
    static Matrix block(Matrix<Element, 2, 1> const& A,
                        Matrix<Element, 2, 2> const& B, Element C,
                        Matrix<Element, 1, 2> const& D) {
        return Matrix(A.at(0, 0), B.at(0, 0), B.at(0, 1), A.at(1, 0),
                      B.at(1, 0), B.at(1, 1), C, D.at(0, 0), D.at(0, 1));
    }

    /// Forms a 3-by-3 matrix by concatenating four components
    CUTLASS_HOST_DEVICE
    static Matrix block(Matrix<Element, 2, 2> const& A,
                        Matrix<Element, 2, 1> const& B,
                        Matrix<Element, 1, 2> const& C, Element D) {
        return Matrix(A.at(0, 0), A.at(0, 1), B.at(0, 0), A.at(1, 0),
                      A.at(1, 1), B.at(1, 0), C.at(0, 0), C.at(0, 1), D);
    }

    /// Elementwise add operator (3-by-3)
    CUTLASS_HOST_DEVICE
    Matrix add(Matrix const& rhs) const {
        Matrix result;

        result.data[0] = data[0] + rhs.data[0];
        result.data[1] = data[1] + rhs.data[1];
        result.data[2] = data[2] + rhs.data[2];

        result.data[3] = data[3] + rhs.data[3];
        result.data[4] = data[4] + rhs.data[4];
        result.data[5] = data[5] + rhs.data[5];

        result.data[6] = data[6] + rhs.data[6];
        result.data[7] = data[7] + rhs.data[7];
        result.data[8] = data[8] + rhs.data[8];

        return result;
    }

    /// Elementwise add operator (3-by-3)
    CUTLASS_HOST_DEVICE
    Matrix operator+(Matrix const& rhs) const { return add(rhs); }

    /// Elementwise add operator (3-by-3)
    CUTLASS_HOST_DEVICE
    Matrix& operator+=(Matrix const& rhs) {
        data[0] += rhs.data[0];
        data[1] += rhs.data[1];
        data[2] += rhs.data[2];

        data[3] += rhs.data[3];
        data[4] += rhs.data[4];
        data[5] += rhs.data[5];

        data[6] += rhs.data[6];
        data[7] += rhs.data[7];
        data[8] += rhs.data[8];

        return *this;
    }

    /// Elementwise subtract operator (3-by-3)
    CUTLASS_HOST_DEVICE
    Matrix subtract(Matrix const& rhs) const {
        Matrix result;

        result.data[0] = data[0] - rhs.data[0];
        result.data[1] = data[1] - rhs.data[1];
        result.data[2] = data[2] - rhs.data[2];

        result.data[3] = data[3] - rhs.data[3];
        result.data[4] = data[4] - rhs.data[4];
        result.data[5] = data[5] - rhs.data[5];

        result.data[6] = data[6] - rhs.data[6];
        result.data[7] = data[7] - rhs.data[7];
        result.data[8] = data[8] - rhs.data[8];

        return result;
    }

    /// Elementwise subtract operator (3-by-3)
    CUTLASS_HOST_DEVICE
    Matrix operator-(Matrix const& rhs) const { return subtract(rhs); }

    /// Elementwise subtract operator (3-by-3)
    CUTLASS_HOST_DEVICE
    Matrix& operator-=(Matrix const& rhs) {
        data[0] -= rhs.data[0];
        data[1] -= rhs.data[1];
        data[2] -= rhs.data[2];

        data[3] -= rhs.data[3];
        data[4] -= rhs.data[4];
        data[5] -= rhs.data[5];

        data[6] -= rhs.data[6];
        data[7] -= rhs.data[7];
        data[8] -= rhs.data[8];

        return *this;
    }

    /// Elementwise multiply operator (3-by-3)
    CUTLASS_HOST_DEVICE
    Matrix multiply(Matrix const& rhs) const {
        Matrix result;

        result.data[0] = data[0] * rhs.data[0];
        result.data[1] = data[1] * rhs.data[1];
        result.data[2] = data[2] * rhs.data[2];

        result.data[3] = data[3] * rhs.data[3];
        result.data[4] = data[4] * rhs.data[4];
        result.data[5] = data[5] * rhs.data[5];

        result.data[6] = data[6] * rhs.data[6];
        result.data[7] = data[7] * rhs.data[7];
        result.data[8] = data[8] * rhs.data[8];

        return result;
    }

    /// Scalar multiply operator (3-by-3)
    CUTLASS_HOST_DEVICE
    Matrix multiply(Element const& s) const {
        Matrix result;

        result.data[0] = data[0] * s;
        result.data[1] = data[1] * s;
        result.data[2] = data[2] * s;

        result.data[3] = data[3] * s;
        result.data[4] = data[4] * s;
        result.data[5] = data[5] * s;

        result.data[6] = data[6] * s;
        result.data[7] = data[7] * s;
        result.data[8] = data[8] * s;

        return result;
    }

    /// Scalar multiply operator (3-by-3)
    CUTLASS_HOST_DEVICE
    Matrix operator*(Element const& s) const { return multiply(s); }

    /// Scalar multiply operator (3-by-3)
    CUTLASS_HOST_DEVICE
    Matrix& operator*=(Element const& s) {
        data[0] *= s;
        data[1] *= s;
        data[2] *= s;

        data[3] *= s;
        data[4] *= s;
        data[5] *= s;

        data[6] *= s;
        data[7] *= s;
        data[8] *= s;

        return *this;
    }

    /// Elementwise divide operator (3-by-3)
    CUTLASS_HOST_DEVICE
    Matrix divide(Matrix const& rhs) const {
        Matrix result;

        result.data[0] = data[0] / rhs.data[0];
        result.data[1] = data[1] / rhs.data[1];
        result.data[2] = data[2] / rhs.data[2];

        result.data[3] = data[3] / rhs.data[3];
        result.data[4] = data[4] / rhs.data[4];
        result.data[5] = data[5] / rhs.data[5];

        result.data[6] = data[6] / rhs.data[6];
        result.data[7] = data[7] / rhs.data[7];
        result.data[8] = data[8] / rhs.data[8];

        return result;
    }

    /// Scalar divide operator (3-by-3)
    CUTLASS_HOST_DEVICE
    Matrix divide(Element const& s) const {
        Matrix result;

        result.data[0] = data[0] / s;
        result.data[1] = data[1] / s;
        result.data[2] = data[2] / s;

        result.data[3] = data[3] / s;
        result.data[4] = data[4] / s;
        result.data[5] = data[5] / s;

        result.data[6] = data[6] / s;
        result.data[7] = data[7] / s;
        result.data[8] = data[8] / s;

        return result;
    }

    /// Scalar divide operator (3-by-3)
    CUTLASS_HOST_DEVICE
    Matrix operator/(Element const& s) const { return divide(s); }

    /// Scalar divide operator (3-by-3)
    CUTLASS_HOST_DEVICE
    Matrix& operator/=(Element const& s) {
        data[0] /= s;
        data[1] /= s;
        data[2] /= s;

        data[3] /= s;
        data[4] /= s;
        data[5] /= s;

        data[6] /= s;
        data[7] /= s;
        data[8] /= s;

        return *this;
    }

    /// Elementwise divide operator (3-by-3)
    CUTLASS_HOST_DEVICE
    Matrix operator/(Matrix const& rhs) const { return divide(rhs); }

    /// Elementwise divide operator (3-by-3)
    CUTLASS_HOST_DEVICE
    Matrix& operator/=(Matrix const& rhs) {
        data[0] /= rhs.data[0];
        data[1] /= rhs.data[1];
        data[2] /= rhs.data[2];

        data[3] /= rhs.data[3];
        data[4] /= rhs.data[4];
        data[5] /= rhs.data[5];

        data[6] /= rhs.data[6];
        data[7] /= rhs.data[7];
        data[8] /= rhs.data[8];

        return *this;
    }

    /// Negates each element of the matrix
    CUTLASS_HOST_DEVICE
    Matrix operator-() const {
        Matrix m;

        m.data[0] = -m.data[0];
        m.data[1] = -m.data[1];
        m.data[2] = -m.data[2];
        m.data[3] = -m.data[3];
        m.data[4] = -m.data[4];
        m.data[5] = -m.data[5];
        m.data[6] = -m.data[6];
        m.data[7] = -m.data[7];
        m.data[8] = -m.data[8];

        return m;
    }

    /// Matrix product of size 3-by-1-by-3
    CUTLASS_HOST_DEVICE
    Matrix<Element, 3, 1> product(
            Matrix<Element, 3, 1> const& rhs,
            Matrix<Element, 3, 1> accum = Matrix<Element, 3, 1>()) const {
        // k=0
        accum.data[0] += data[0] * rhs.data[0];
        accum.data[1] += data[3] * rhs.data[0];
        accum.data[2] += data[6] * rhs.data[0];

        // k=1
        accum.data[0] += data[1] * rhs.data[1];
        accum.data[1] += data[4] * rhs.data[1];
        accum.data[2] += data[7] * rhs.data[1];

        // k=2
        accum.data[0] += data[2] * rhs.data[2];
        accum.data[1] += data[5] * rhs.data[2];
        accum.data[2] += data[8] * rhs.data[2];

        return accum;
    }

    /// Matrix product of size 3-by-1-by-3
    CUTLASS_HOST_DEVICE
    Matrix<Element, 3, 1> operator*(Matrix<Element, 3, 1> const& rhs) const {
        return product(rhs);
    }

    /// Matrix product of size 3-by-2-by-3
    CUTLASS_HOST_DEVICE
    Matrix<Element, 3, 2> product(
            Matrix<Element, 3, 2> const& rhs,
            Matrix<Element, 3, 2> accum = Matrix<Element, 3, 2>()) const {
        // k=0
        accum.data[0] += data[0] * rhs.data[0];
        accum.data[1] += data[0] * rhs.data[1];
        accum.data[2] += data[3] * rhs.data[0];
        accum.data[3] += data[3] * rhs.data[1];
        accum.data[4] += data[6] * rhs.data[0];
        accum.data[5] += data[6] * rhs.data[1];

        // k=1
        accum.data[0] += data[1] * rhs.data[2];
        accum.data[1] += data[1] * rhs.data[3];
        accum.data[2] += data[4] * rhs.data[2];
        accum.data[3] += data[4] * rhs.data[3];
        accum.data[4] += data[7] * rhs.data[2];
        accum.data[5] += data[7] * rhs.data[3];

        // k=2
        accum.data[0] += data[2] * rhs.data[4];
        accum.data[1] += data[2] * rhs.data[5];
        accum.data[2] += data[5] * rhs.data[4];
        accum.data[3] += data[5] * rhs.data[5];
        accum.data[4] += data[8] * rhs.data[4];
        accum.data[5] += data[8] * rhs.data[5];

        return accum;
    }

    /// Matrix product of size 3-by-2-by-3
    CUTLASS_HOST_DEVICE
    Matrix<Element, 3, 2> operator*(Matrix<Element, 3, 2> const& rhs) const {
        return product(rhs);
    }

    /// Matrix product of size 3-by-3-by-3
    CUTLASS_HOST_DEVICE
    Matrix<Element, 3, 3> product(
            Matrix<Element, 3, 3> const& rhs,
            Matrix<Element, 3, 3> accum = Matrix<Element, 3, 3>()) const {
        // k=0
        accum.data[0] += data[0] * rhs.data[0];
        accum.data[1] += data[0] * rhs.data[1];
        accum.data[2] += data[0] * rhs.data[2];
        accum.data[3] += data[3] * rhs.data[0];
        accum.data[4] += data[3] * rhs.data[1];
        accum.data[5] += data[3] * rhs.data[2];
        accum.data[6] += data[6] * rhs.data[0];
        accum.data[7] += data[6] * rhs.data[1];
        accum.data[8] += data[6] * rhs.data[2];

        // k=1
        accum.data[0] += data[1] * rhs.data[3];
        accum.data[1] += data[1] * rhs.data[4];
        accum.data[2] += data[1] * rhs.data[5];
        accum.data[3] += data[4] * rhs.data[3];
        accum.data[4] += data[4] * rhs.data[4];
        accum.data[5] += data[4] * rhs.data[5];
        accum.data[6] += data[7] * rhs.data[3];
        accum.data[7] += data[7] * rhs.data[4];
        accum.data[8] += data[7] * rhs.data[5];

        // k=2
        accum.data[0] += data[2] * rhs.data[6];
        accum.data[1] += data[2] * rhs.data[7];
        accum.data[2] += data[2] * rhs.data[8];
        accum.data[3] += data[5] * rhs.data[6];
        accum.data[4] += data[5] * rhs.data[7];
        accum.data[5] += data[5] * rhs.data[8];
        accum.data[6] += data[8] * rhs.data[6];
        accum.data[7] += data[8] * rhs.data[7];
        accum.data[8] += data[8] * rhs.data[8];

        return accum;
    }

    /// Matrix product of size 3-by-3-by-3
    CUTLASS_HOST_DEVICE
    Matrix<Element, 3, 3> operator*(Matrix<Element, 3, 3> const& rhs) const {
        return product(rhs);
    }

    /// Matrix product of size 3-by-3-by-3
    CUTLASS_HOST_DEVICE
    Matrix& operator*=(Matrix<Element, 3, 3> const& rhs) {
        *this = product(rhs);
        return *this;
    }

    /// Matrix product of size 3-by-4-by-3
    CUTLASS_HOST_DEVICE
    Matrix<Element, 3, 4> product(
            Matrix<Element, 3, 4> const& rhs,
            Matrix<Element, 3, 4> accum = Matrix<Element, 3, 4>()) const {
        // k=0
        accum.data[0] += data[0] * rhs.data[0];
        accum.data[1] += data[0] * rhs.data[1];
        accum.data[2] += data[0] * rhs.data[2];
        accum.data[3] += data[0] * rhs.data[3];
        accum.data[4] += data[3] * rhs.data[0];
        accum.data[5] += data[3] * rhs.data[1];
        accum.data[6] += data[3] * rhs.data[2];
        accum.data[7] += data[3] * rhs.data[3];
        accum.data[8] += data[6] * rhs.data[0];
        accum.data[9] += data[6] * rhs.data[1];
        accum.data[10] += data[6] * rhs.data[2];
        accum.data[11] += data[6] * rhs.data[3];

        // k=1
        accum.data[0] += data[1] * rhs.data[4];
        accum.data[1] += data[1] * rhs.data[5];
        accum.data[2] += data[1] * rhs.data[6];
        accum.data[3] += data[1] * rhs.data[7];
        accum.data[4] += data[4] * rhs.data[4];
        accum.data[5] += data[4] * rhs.data[5];
        accum.data[6] += data[4] * rhs.data[6];
        accum.data[7] += data[4] * rhs.data[7];
        accum.data[8] += data[7] * rhs.data[4];
        accum.data[9] += data[7] * rhs.data[5];
        accum.data[10] += data[7] * rhs.data[6];
        accum.data[11] += data[7] * rhs.data[7];

        // k=2
        accum.data[0] += data[2] * rhs.data[8];
        accum.data[1] += data[2] * rhs.data[9];
        accum.data[2] += data[2] * rhs.data[10];
        accum.data[3] += data[2] * rhs.data[11];
        accum.data[4] += data[5] * rhs.data[8];
        accum.data[5] += data[5] * rhs.data[9];
        accum.data[6] += data[5] * rhs.data[10];
        accum.data[7] += data[5] * rhs.data[11];
        accum.data[8] += data[8] * rhs.data[8];
        accum.data[9] += data[8] * rhs.data[9];
        accum.data[10] += data[8] * rhs.data[10];
        accum.data[11] += data[8] * rhs.data[11];

        return accum;
    }

    /// Matrix product of size 3-by-4-by-3
    CUTLASS_HOST_DEVICE
    Matrix<Element, 3, 4> operator*(Matrix<Element, 3, 4> const& rhs) const {
        return product(rhs);
    }

    /// Returns the sum of elements
    CUTLASS_HOST_DEVICE
    Element sum(Element accum = Element()) const {
        accum += data[0];
        accum += data[1];
        accum += data[2];
        accum += data[3];
        accum += data[4];
        accum += data[5];
        accum += data[6];
        accum += data[7];
        accum += data[8];

        return accum;
    }

    /// Returns the sum of squared elements
    CUTLASS_HOST_DEVICE
    Element norm(Element accum = Element()) const {
        accum += data[0] * data[0];
        accum += data[1] * data[1];
        accum += data[2] * data[2];
        accum += data[3] * data[3];
        accum += data[4] * data[4];
        accum += data[5] * data[5];
        accum += data[6] * data[6];
        accum += data[7] * data[7];
        accum += data[8] * data[8];

        return accum;
    }

    /// Returns square root of the norm
    CUTLASS_HOST_DEVICE
    Element magnitude() const { return fast_sqrt(norm()); }

    /// Returns the sum of diagonal elements
    CUTLASS_HOST_DEVICE
    Element trace(Element accum = Element()) const {
        accum += data[0];
        accum += data[4];
        accum += data[8];

        return accum;
    }

    /// Returns 3-by-3 rotation matrix around the X axis
    CUTLASS_HOST_DEVICE
    static Matrix rotation_X(Element theta) {
        Matrix m = identity();

        Element c = fast_cos(theta);
        Element s = fast_sin(theta);

        m.at(1, 1) = c;
        m.at(1, 2) = -s;
        m.at(2, 1) = s;
        m.at(2, 2) = c;

        return m;
    }

    /// Returns 3-by-3 rotation matrix around the Y axis
    CUTLASS_HOST_DEVICE
    static Matrix rotation_Y(Element theta) {
        Matrix m = identity();

        Element c = fast_cos(theta);
        Element s = fast_sin(theta);

        m.at(0, 0) = c;
        m.at(2, 0) = -s;
        m.at(0, 2) = s;
        m.at(2, 2) = c;

        return m;
    }

    /// Returns 3-by-3 rotation matrix around the Z axis
    CUTLASS_HOST_DEVICE
    static Matrix rotation_Z(Element theta) {
        Matrix m = Matrix::identity();

        Element c = fast_cos(theta);
        Element s = fast_sin(theta);

        m.at(0, 0) = c;
        m.at(0, 1) = -s;
        m.at(1, 0) = s;
        m.at(1, 1) = c;

        return m;
    }

    /// Returns a 3-by-3 rotation matrix around a unit-length axis
    CUTLASS_HOST_DEVICE
    static Matrix rotation(Element theta, Matrix<Element, 3, 1> const& u) {
        Element x = u.data[0];
        Element y = u.data[1];
        Element z = u.data[2];

        Element c = fast_cos(theta);
        Element s = fast_sin(theta);

        Element one_minus_cos = Element(1) - fast_cos(theta);

        Matrix m;

        m.set_slice3x3(
                {c + x * x * one_minus_cos, x * y * one_minus_cos - z * s,
                 x * z * one_minus_cos + y * s, y * x * one_minus_cos * z * s,
                 c + y * y * one_minus_cos, y * z * one_minus_cos - x * s,
                 z * x * one_minus_cos - y * s, z * y * one_minus_cos + x * s,
                 c + z * z * one_minus_cos});

        return m;
    }

    /// Returns a 3-by-3 reflection about the plane specified by the
    /// unit-length normal vector n_unit
    CUTLASS_HOST_DEVICE
    static Matrix reflection(Matrix<Element, 3, 1> const& n_unit) {
        Element a = n_unit.data[0];
        Element b = n_unit.data[1];
        Element c = n_unit.data[2];

        Matrix m = Matrix::identity();

        m.set_slice3x3({Element(1) - Element(2) * a * a, Element(-2) * a * b,
                        Element(-2) * a * c, Element(-2) * a * b,
                        Element(1) - Element(2) * b * b, Element(-2) * b * c,
                        Element(-2) * a * c, Element(-2) * b * c,
                        Element(1) - Element(2) * c * c});

        return m;
    }

    /// Computes the determinant of a 3-by-3 matrix
    CUTLASS_HOST_DEVICE
    Element determinant(Element accum = Element()) const {
        accum += at(0, 0) *
                 Matrix<Element, 2, 2>({at(1, 1), at(1, 2), at(2, 1), at(2, 2)})
                         .determinant();
        accum -= at(0, 1) *
                 Matrix<Element, 2, 2>({at(1, 0), at(1, 2), at(2, 0), at(2, 2)})
                         .determinant();
        accum += at(0, 2) *
                 Matrix<Element, 2, 2>({at(1, 0), at(1, 1), at(2, 0), at(2, 1)})
                         .determinant();

        return accum;
    }

    /// Computes the inverse of a 3-by-3 matrix given
    /// the matrix's determinant
    CUTLASS_HOST_DEVICE
    Matrix inverse(Element det) const {
        return Matrix(at(1, 1) * at(2, 2) - at(1, 2) * at(2, 1),
                      at(0, 2) * at(2, 1) - at(0, 1) * at(2, 2),
                      at(0, 1) * at(1, 2) - at(0, 2) * at(1, 1),

                      at(1, 2) * at(2, 0) - at(1, 0) * at(2, 2),
                      at(0, 0) * at(2, 2) - at(0, 2) * at(2, 0),
                      at(0, 2) * at(1, 0) - at(0, 0) * at(1, 2),

                      at(1, 0) * at(2, 1) - at(1, 1) * at(2, 0),
                      at(0, 1) * at(2, 0) - at(0, 0) * at(2, 1),
                      at(0, 0) * at(1, 1) - at(0, 1) * at(1, 0)) *
               (Element(1) / det);
    }
    /// Computes the inverse of a 3-by-3 matrix
    CUTLASS_HOST_DEVICE
    Matrix inverse() const { return inverse(determinant()); }
};

/// Template alias for 3-by-3 matrix
template <typename Element>
using Matrix3x3 = Matrix<Element, 3, 3>;

/// Free funciton to infer element type from template arguments
template <typename Element>
CUTLASS_HOST_DEVICE Matrix3x3<Element> make_Matrix3x3(
        Element _0_0, Element _0_1, Element _0_2, Element _1_0, Element _1_1,
        Element _1_2, Element _2_0, Element _2_1, Element _2_2) {
    return Matrix3x3<Element>(_0_0, _0_1, _0_2, _1_0, _1_1, _1_2, _2_0, _2_1,
                              _2_2);
}

/////////////////////////////////////////////////////////////////////////////////////////////////

/// 3-by-4 matrix template class definition
template <typename Element_>
struct Matrix<Element_, 3, 4> {
    //
    // Type definitions
    //

    /// Element data type
    using Element = Element_;

    /// Number of rows in matrix
    static int const kRows = 3;

    /// Number of columns in matrix
    static int const kColumns = 4;

    /// Layout of matrix in underlying array
    using Layout = layout::RowMajor;

    /// Number of elements in matrix
    static int const kCount = 12;

    //
    // Data members
    //

    /// Elements of the matrix in row-major layout
    Array<Element, kCount> data;

    //
    // Methods
    //

    /// Constructs a zero matrix
    CUTLASS_HOST_DEVICE
    Matrix() { data.clear(); }

    /// Copy constructor for a 3-by-4 matrix
    CUTLASS_HOST_DEVICE
    Matrix(Matrix const& rhs) { data = rhs.data; }

    /// Constucts a 3-by-4 matrix from scalar elements
    CUTLASS_HOST_DEVICE
    Matrix(Element _0_0, Element _0_1, Element _0_2, Element _0_3, Element _1_0,
           Element _1_1, Element _1_2, Element _1_3, Element _2_0, Element _2_1,
           Element _2_2, Element _2_3) {
        data[0] = _0_0;
        data[1] = _0_1;
        data[2] = _0_2;
        data[3] = _0_3;
        data[4] = _1_0;
        data[5] = _1_1;
        data[6] = _1_2;
        data[7] = _1_3;
        data[8] = _2_0;
        data[9] = _2_1;
        data[10] = _2_2;
        data[11] = _2_3;
    }

    /// Constucts a 3-by-4 matrix from row vectors
    CUTLASS_HOST_DEVICE
    Matrix(Matrix<Element, 1, 4> const& row_0,
           Matrix<Element, 1, 4> const& row_1,
           Matrix<Element, 1, 4> const& row_2) {
        data[0] = row_0.data[0];
        data[1] = row_0.data[1];
        data[2] = row_0.data[2];
        data[3] = row_0.data[3];
        data[4] = row_1.data[0];
        data[5] = row_1.data[1];
        data[6] = row_1.data[2];
        data[7] = row_1.data[3];
        data[8] = row_2.data[0];
        data[9] = row_2.data[1];
        data[10] = row_2.data[2];
        data[11] = row_2.data[3];
    }

    /// Static method to construct a 3-by-4 matrix from column vectors
    CUTLASS_HOST_DEVICE
    static Matrix from_columns(Matrix<Element, 4, 1> const& column_0,
                               Matrix<Element, 4, 1> const& column_1,
                               Matrix<Element, 4, 1> const& column_2,
                               Matrix<Element, 4, 1> const& column_3) {
        Matrix result;

        result.data[0] = column_0.data[0];
        result.data[1] = column_1.data[0];
        result.data[2] = column_2.data[0];
        result.data[3] = column_3.data[0];
        result.data[4] = column_0.data[1];
        result.data[5] = column_1.data[1];
        result.data[6] = column_2.data[1];
        result.data[7] = column_3.data[1];
        result.data[8] = column_0.data[2];
        result.data[9] = column_1.data[2];
        result.data[10] = column_2.data[2];
        result.data[11] = column_3.data[2];
        return result;
    }

    /// Constructs a matrix from a uniform element
    CUTLASS_HOST_DEVICE
    static Matrix uniform(Element s) {
        Matrix m;

        m.data[0] = s;
        m.data[1] = s;
        m.data[2] = s;
        m.data[3] = s;
        m.data[4] = s;
        m.data[5] = s;
        m.data[6] = s;
        m.data[7] = s;
        m.data[8] = s;
        m.data[9] = s;
        m.data[10] = s;
        m.data[11] = s;

        return m;
    }

    /// Constructs a matrix from a uniform element 1
    CUTLASS_HOST_DEVICE
    static Matrix ones() { return uniform(Element(1)); }

    /// Constructs a matrix from a uniform element 0
    CUTLASS_HOST_DEVICE
    static Matrix zero() { return Matrix(); }

    /// Constructs a matrix from elements along its diagonal
    CUTLASS_HOST_DEVICE
    static Matrix from_diagonal(Matrix<Element, 3, 1> const& diag) {
        Matrix m;

        m.data[0] = diag.data[0];
        m.data[4] = diag.data[1];
        m.data[8] = diag.data[2];

        return m;
    }

    /// Constructs a matrix from elements along its diagonal
    CUTLASS_HOST_DEVICE
    static Matrix from_diagonal(Matrix<Element, 1, 3> const& diag) {
        Matrix m;

        m.data[0] = diag.data[0];
        m.data[4] = diag.data[1];
        m.data[8] = diag.data[2];

        return m;
    }

    /// Gets an array of diagonal elements
    CUTLASS_HOST_DEVICE
    Matrix<Element, 3, 1> diagonal() const {
        Matrix<Element, 3, 1> diag;

        diag.data[0] = data[0];
        diag.data[1] = data[4];
        diag.data[2] = data[8];

        return diag;
    }

    /// Returns a transposed matrix
    CUTLASS_HOST_DEVICE
    Matrix<Element, 4, 3> transpose() const {
        Matrix<Element, 4, 3> mt;

        mt.data[0] = data[0];
        mt.data[3] = data[1];
        mt.data[6] = data[2];
        mt.data[9] = data[3];
        mt.data[1] = data[4];
        mt.data[4] = data[5];
        mt.data[7] = data[6];
        mt.data[10] = data[7];
        mt.data[2] = data[8];
        mt.data[5] = data[9];
        mt.data[8] = data[10];
        mt.data[11] = data[11];

        return mt;
    }

    /// Accesses an element by coordinate
    CUTLASS_HOST_DEVICE
    Element at(int i, int j) const { return data[i * 3 + j]; }

    /// Accesses an element by coordinate
    CUTLASS_HOST_DEVICE
    Element& at(int i, int j) { return data[i * 3 + j]; }

    /// Accesses an element by coordinate
    CUTLASS_HOST_DEVICE
    Element at(Coord<2> const& coord) const { return at(coord[0], coord[1]); }

    /// Accesses an element by coordinate
    CUTLASS_HOST_DEVICE
    Element& at(Coord<2> const& coord) { return at(coord[0], coord[1]); }

    /// Accesses an element by offset
    CUTLASS_HOST_DEVICE
    Element& at(int offset) { return data[offset]; }

    /// Accesses an element by offset
    CUTLASS_HOST_DEVICE
    Element at(int offset) const { return data[offset]; }

    /// Accesses an element by coordinate
    CUTLASS_HOST_DEVICE
    Element operator[](Coord<2> const& coord) const {
        return at(coord[0], coord[1]);
    }

    /// Accesses an element by coordinate
    CUTLASS_HOST_DEVICE
    Element& operator[](Coord<2> const& coord) {
        return at(coord[0], coord[1]);
    }

    /// Accesses an element by offset
    CUTLASS_HOST_DEVICE
    Element& operator[](int offset) { return data[offset]; }

    /// Accesses an element by offset
    CUTLASS_HOST_DEVICE
    Element operator[](int offset) const { return data[offset]; }

    /// Gets a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix<Element, 1, 2> slice_1x2(int i = 0, int j = 0) const {
        Matrix<Element, 1, 2> m;

        m.data[0] = data[i * 4 + j + 0];
        m.data[1] = data[i * 4 + j + 1];

        return m;
    }

    /// Overwrites a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix& set_slice_1x2(Matrix<Element, 1, 2> const& m, int i = 0,
                          int j = 0) {
        data[i * 4 + j + 0] = m.data[0];
        data[i * 4 + j + 1] = m.data[1];

        return *this;
    }

    /// Gets a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix<Element, 1, 3> slice_1x3(int i = 0, int j = 0) const {
        Matrix<Element, 1, 3> m;

        m.data[0] = data[i * 4 + j + 0];
        m.data[1] = data[i * 4 + j + 1];
        m.data[2] = data[i * 4 + j + 2];

        return m;
    }

    /// Overwrites a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix& set_slice_1x3(Matrix<Element, 1, 3> const& m, int i = 0,
                          int j = 0) {
        data[i * 4 + j + 0] = m.data[0];
        data[i * 4 + j + 1] = m.data[1];
        data[i * 4 + j + 2] = m.data[2];

        return *this;
    }

    /// Gets a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix<Element, 1, 4> slice_1x4(int i = 0, int j = 0) const {
        Matrix<Element, 1, 4> m;

        m.data[0] = data[i * 4 + j + 0];
        m.data[1] = data[i * 4 + j + 1];
        m.data[2] = data[i * 4 + j + 2];
        m.data[3] = data[i * 4 + j + 3];

        return m;
    }

    /// Overwrites a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix& set_slice_1x4(Matrix<Element, 1, 4> const& m, int i = 0,
                          int j = 0) {
        data[i * 4 + j + 0] = m.data[0];
        data[i * 4 + j + 1] = m.data[1];
        data[i * 4 + j + 2] = m.data[2];
        data[i * 4 + j + 3] = m.data[3];

        return *this;
    }

    CUTLASS_HOST_DEVICE
    Matrix<Element, 1, 4> row(int i) const { return slice_1x4(i, 0); }

    Matrix& set_row(Matrix<Element, 1, 4> const& v, int i = 0) {
        return set_slice_1x4(v, i, 0);
    }

    /// Gets a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix<Element, 2, 1> slice_2x1(int i = 0, int j = 0) const {
        Matrix<Element, 2, 1> m;

        m.data[0] = data[i * 4 + j + 0];
        m.data[1] = data[i * 4 + j + 4];

        return m;
    }

    /// Overwrites a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix& set_slice_2x1(Matrix<Element, 2, 1> const& m, int i = 0,
                          int j = 0) {
        data[i * 4 + j + 0] = m.data[0];
        data[i * 4 + j + 4] = m.data[1];

        return *this;
    }

    /// Gets a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix<Element, 2, 2> slice_2x2(int i = 0, int j = 0) const {
        Matrix<Element, 2, 2> m;

        m.data[0] = data[i * 4 + j + 0];
        m.data[1] = data[i * 4 + j + 1];
        m.data[2] = data[i * 4 + j + 4];
        m.data[3] = data[i * 4 + j + 5];

        return m;
    }

    /// Overwrites a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix& set_slice_2x2(Matrix<Element, 2, 2> const& m, int i = 0,
                          int j = 0) {
        data[i * 4 + j + 0] = m.data[0];
        data[i * 4 + j + 1] = m.data[1];
        data[i * 4 + j + 4] = m.data[2];
        data[i * 4 + j + 5] = m.data[3];

        return *this;
    }

    /// Gets a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix<Element, 2, 3> slice_2x3(int i = 0, int j = 0) const {
        Matrix<Element, 2, 3> m;

        m.data[0] = data[i * 4 + j + 0];
        m.data[1] = data[i * 4 + j + 1];
        m.data[2] = data[i * 4 + j + 2];
        m.data[3] = data[i * 4 + j + 4];
        m.data[4] = data[i * 4 + j + 5];
        m.data[5] = data[i * 4 + j + 6];

        return m;
    }

    /// Overwrites a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix& set_slice_2x3(Matrix<Element, 2, 3> const& m, int i = 0,
                          int j = 0) {
        data[i * 4 + j + 0] = m.data[0];
        data[i * 4 + j + 1] = m.data[1];
        data[i * 4 + j + 2] = m.data[2];
        data[i * 4 + j + 4] = m.data[3];
        data[i * 4 + j + 5] = m.data[4];
        data[i * 4 + j + 6] = m.data[5];

        return *this;
    }

    /// Gets a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix<Element, 2, 4> slice_2x4(int i = 0, int j = 0) const {
        Matrix<Element, 2, 4> m;

        m.data[0] = data[i * 4 + j + 0];
        m.data[1] = data[i * 4 + j + 1];
        m.data[2] = data[i * 4 + j + 2];
        m.data[3] = data[i * 4 + j + 3];
        m.data[4] = data[i * 4 + j + 4];
        m.data[5] = data[i * 4 + j + 5];
        m.data[6] = data[i * 4 + j + 6];
        m.data[7] = data[i * 4 + j + 7];

        return m;
    }

    /// Overwrites a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix& set_slice_2x4(Matrix<Element, 2, 4> const& m, int i = 0,
                          int j = 0) {
        data[i * 4 + j + 0] = m.data[0];
        data[i * 4 + j + 1] = m.data[1];
        data[i * 4 + j + 2] = m.data[2];
        data[i * 4 + j + 3] = m.data[3];
        data[i * 4 + j + 4] = m.data[4];
        data[i * 4 + j + 5] = m.data[5];
        data[i * 4 + j + 6] = m.data[6];
        data[i * 4 + j + 7] = m.data[7];

        return *this;
    }

    /// Gets a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix<Element, 3, 1> slice_3x1(int i = 0, int j = 0) const {
        Matrix<Element, 3, 1> m;

        m.data[0] = data[i * 4 + j + 0];
        m.data[1] = data[i * 4 + j + 4];
        m.data[2] = data[i * 4 + j + 8];

        return m;
    }

    /// Overwrites a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix& set_slice_3x1(Matrix<Element, 3, 1> const& m, int i = 0,
                          int j = 0) {
        data[i * 4 + j + 0] = m.data[0];
        data[i * 4 + j + 4] = m.data[1];
        data[i * 4 + j + 8] = m.data[2];

        return *this;
    }

    CUTLASS_HOST_DEVICE
    Matrix<Element, 3, 1> column(int j) const { return slice_3x1(0, j); }

    Matrix& set_column(Matrix<Element, 3, 1> const& v, int j = 0) {
        return set_slice_3x1(v, 0, j);
    }

    /// Gets a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix<Element, 3, 2> slice_3x2(int i = 0, int j = 0) const {
        Matrix<Element, 3, 2> m;

        m.data[0] = data[i * 4 + j + 0];
        m.data[1] = data[i * 4 + j + 1];
        m.data[2] = data[i * 4 + j + 4];
        m.data[3] = data[i * 4 + j + 5];
        m.data[4] = data[i * 4 + j + 8];
        m.data[5] = data[i * 4 + j + 9];

        return m;
    }

    /// Overwrites a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix& set_slice_3x2(Matrix<Element, 3, 2> const& m, int i = 0,
                          int j = 0) {
        data[i * 4 + j + 0] = m.data[0];
        data[i * 4 + j + 1] = m.data[1];
        data[i * 4 + j + 4] = m.data[2];
        data[i * 4 + j + 5] = m.data[3];
        data[i * 4 + j + 8] = m.data[4];
        data[i * 4 + j + 9] = m.data[5];

        return *this;
    }

    /// Gets a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix<Element, 3, 3> slice_3x3(int i = 0, int j = 0) const {
        Matrix<Element, 3, 3> m;

        m.data[0] = data[i * 4 + j + 0];
        m.data[1] = data[i * 4 + j + 1];
        m.data[2] = data[i * 4 + j + 2];
        m.data[3] = data[i * 4 + j + 4];
        m.data[4] = data[i * 4 + j + 5];
        m.data[5] = data[i * 4 + j + 6];
        m.data[6] = data[i * 4 + j + 8];
        m.data[7] = data[i * 4 + j + 9];
        m.data[8] = data[i * 4 + j + 10];

        return m;
    }

    /// Overwrites a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix& set_slice_3x3(Matrix<Element, 3, 3> const& m, int i = 0,
                          int j = 0) {
        data[i * 4 + j + 0] = m.data[0];
        data[i * 4 + j + 1] = m.data[1];
        data[i * 4 + j + 2] = m.data[2];
        data[i * 4 + j + 4] = m.data[3];
        data[i * 4 + j + 5] = m.data[4];
        data[i * 4 + j + 6] = m.data[5];
        data[i * 4 + j + 8] = m.data[6];
        data[i * 4 + j + 9] = m.data[7];
        data[i * 4 + j + 10] = m.data[8];

        return *this;
    }

    /// Gets a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix<Element, 3, 4> slice_3x4(int i = 0, int j = 0) const {
        Matrix<Element, 3, 4> m;

        m.data[0] = data[i * 4 + j + 0];
        m.data[1] = data[i * 4 + j + 1];
        m.data[2] = data[i * 4 + j + 2];
        m.data[3] = data[i * 4 + j + 3];
        m.data[4] = data[i * 4 + j + 4];
        m.data[5] = data[i * 4 + j + 5];
        m.data[6] = data[i * 4 + j + 6];
        m.data[7] = data[i * 4 + j + 7];
        m.data[8] = data[i * 4 + j + 8];
        m.data[9] = data[i * 4 + j + 9];
        m.data[10] = data[i * 4 + j + 10];
        m.data[11] = data[i * 4 + j + 11];

        return m;
    }

    /// Overwrites a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix& set_slice_3x4(Matrix<Element, 3, 4> const& m, int i = 0,
                          int j = 0) {
        data[i * 4 + j + 0] = m.data[0];
        data[i * 4 + j + 1] = m.data[1];
        data[i * 4 + j + 2] = m.data[2];
        data[i * 4 + j + 3] = m.data[3];
        data[i * 4 + j + 4] = m.data[4];
        data[i * 4 + j + 5] = m.data[5];
        data[i * 4 + j + 6] = m.data[6];
        data[i * 4 + j + 7] = m.data[7];
        data[i * 4 + j + 8] = m.data[8];
        data[i * 4 + j + 9] = m.data[9];
        data[i * 4 + j + 10] = m.data[10];
        data[i * 4 + j + 11] = m.data[11];

        return *this;
    }

    /// Forms a 3-by-4 matrix by horizontally concatenating a 3-by-1 matrix with
    /// a 3-by-3 matrix
    CUTLASS_HOST_DEVICE
    static Matrix hcat(Matrix<Element, 3, 1> const& lhs,
                       Matrix<Element, 3, 3> const& rhs) {
        return Matrix(lhs.at(0, 0), rhs.at(0, 0), rhs.at(0, 1), rhs.at(0, 2),
                      lhs.at(1, 0), rhs.at(1, 0), rhs.at(1, 1), rhs.at(1, 2),
                      lhs.at(2, 0), rhs.at(2, 0), rhs.at(2, 1), rhs.at(2, 2));
    }

    /// Forms a 3-by-4 matrix by horizontally concatenating a 3-by-2 matrix with
    /// a 3-by-2 matrix
    CUTLASS_HOST_DEVICE
    static Matrix hcat(Matrix<Element, 3, 2> const& lhs,
                       Matrix<Element, 3, 2> const& rhs) {
        return Matrix(lhs.at(0, 0), lhs.at(0, 1), rhs.at(0, 0), rhs.at(0, 1),
                      lhs.at(1, 0), lhs.at(1, 1), rhs.at(1, 0), rhs.at(1, 1),
                      lhs.at(2, 0), lhs.at(2, 1), rhs.at(2, 0), rhs.at(2, 1));
    }

    /// Forms a 3-by-4 matrix by horizontally concatenating a 3-by-3 matrix with
    /// a 3-by-1 matrix
    CUTLASS_HOST_DEVICE
    static Matrix hcat(Matrix<Element, 3, 3> const& lhs,
                       Matrix<Element, 3, 1> const& rhs) {
        return Matrix(lhs.at(0, 0), lhs.at(0, 1), lhs.at(0, 2), rhs.at(0, 0),
                      lhs.at(1, 0), lhs.at(1, 1), lhs.at(1, 2), rhs.at(1, 0),
                      lhs.at(2, 0), lhs.at(2, 1), lhs.at(2, 2), rhs.at(2, 0));
    }

    /// Forms a 3-by-4 matrix by vertically concatenating a 1-by-4 matrix with a
    /// 2-by-4 matrix
    CUTLASS_HOST_DEVICE
    static Matrix vcat(Matrix<Element, 1, 4> const& upper,
                       Matrix<Element, 2, 4> const& lower) {
        return Matrix(upper.at(0, 0), upper.at(0, 1), upper.at(0, 2),
                      upper.at(0, 3), lower.at(0, 0), lower.at(0, 1),
                      lower.at(0, 2), lower.at(0, 3), lower.at(1, 0),
                      lower.at(1, 1), lower.at(1, 2), lower.at(1, 3));
    }

    /// Forms a 3-by-4 matrix by vertically concatenating a 2-by-4 matrix with a
    /// 1-by-4 matrix
    CUTLASS_HOST_DEVICE
    static Matrix vcat(Matrix<Element, 2, 4> const& upper,
                       Matrix<Element, 1, 4> const& lower) {
        return Matrix(upper.at(0, 0), upper.at(0, 1), upper.at(0, 2),
                      upper.at(0, 3), upper.at(1, 0), upper.at(1, 1),
                      upper.at(1, 2), upper.at(1, 3), lower.at(0, 0),
                      lower.at(0, 1), lower.at(0, 2), lower.at(0, 3));
    }

    /// Concatenates this matrix with a a 1-by-4 matrix to form a 4-by-4 matrix
    CUTLASS_HOST_DEVICE
    Matrix<Element, 4, 4> vcat(Matrix<Element, 1, 4> const& rhs) const {
        return Matrix<Element, 4, 4>::vcat(*this, rhs);
    }

    /// Forms a 3-by-4 matrix by concatenating four components
    CUTLASS_HOST_DEVICE
    static Matrix block(Element A, Matrix<Element, 1, 3> const& B,
                        Matrix<Element, 2, 1> const& C,
                        Matrix<Element, 2, 3> const& D) {
        return Matrix(A, B.at(0, 0), B.at(0, 1), B.at(0, 2), C.at(0, 0),
                      D.at(0, 0), D.at(0, 1), D.at(0, 2), C.at(1, 0),
                      D.at(1, 0), D.at(1, 1), D.at(1, 2));
    }

    /// Forms a 3-by-4 matrix by concatenating four components
    CUTLASS_HOST_DEVICE
    static Matrix block(Matrix<Element, 1, 2> const& A,
                        Matrix<Element, 1, 2> const& B,
                        Matrix<Element, 2, 2> const& C,
                        Matrix<Element, 2, 2> const& D) {
        return Matrix(A.at(0, 0), A.at(0, 1), B.at(0, 0), B.at(0, 1),
                      C.at(0, 0), C.at(0, 1), D.at(0, 0), D.at(0, 1),
                      C.at(1, 0), C.at(1, 1), D.at(1, 0), D.at(1, 1));
    }

    /// Forms a 3-by-4 matrix by concatenating four components
    CUTLASS_HOST_DEVICE
    static Matrix block(Matrix<Element, 1, 3> const& A, Element B,
                        Matrix<Element, 2, 3> const& C,
                        Matrix<Element, 2, 1> const& D) {
        return Matrix(A.at(0, 0), A.at(0, 1), A.at(0, 2), B, C.at(0, 0),
                      C.at(0, 1), C.at(0, 2), D.at(0, 0), C.at(1, 0),
                      C.at(1, 1), C.at(1, 2), D.at(1, 0));
    }

    /// Forms a 3-by-4 matrix by concatenating four components
    CUTLASS_HOST_DEVICE
    static Matrix block(Matrix<Element, 2, 1> const& A,
                        Matrix<Element, 2, 3> const& B, Element C,
                        Matrix<Element, 1, 3> const& D) {
        return Matrix(A.at(0, 0), B.at(0, 0), B.at(0, 1), B.at(0, 2),
                      A.at(1, 0), B.at(1, 0), B.at(1, 1), B.at(1, 2), C,
                      D.at(0, 0), D.at(0, 1), D.at(0, 2));
    }

    /// Forms a 3-by-4 matrix by concatenating four components
    CUTLASS_HOST_DEVICE
    static Matrix block(Matrix<Element, 2, 2> const& A,
                        Matrix<Element, 2, 2> const& B,
                        Matrix<Element, 1, 2> const& C,
                        Matrix<Element, 1, 2> const& D) {
        return Matrix(A.at(0, 0), A.at(0, 1), B.at(0, 0), B.at(0, 1),
                      A.at(1, 0), A.at(1, 1), B.at(1, 0), B.at(1, 1),
                      C.at(0, 0), C.at(0, 1), D.at(0, 0), D.at(0, 1));
    }

    /// Forms a 3-by-4 matrix by concatenating four components
    CUTLASS_HOST_DEVICE
    static Matrix block(Matrix<Element, 2, 3> const& A,
                        Matrix<Element, 2, 1> const& B,
                        Matrix<Element, 1, 3> const& C, Element D) {
        return Matrix(A.at(0, 0), A.at(0, 1), A.at(0, 2), B.at(0, 0),
                      A.at(1, 0), A.at(1, 1), A.at(1, 2), B.at(1, 0),
                      C.at(0, 0), C.at(0, 1), C.at(0, 2), D);
    }

    /// Elementwise add operator (3-by-4)
    CUTLASS_HOST_DEVICE
    Matrix add(Matrix const& rhs) const {
        Matrix result;

        result.data[0] = data[0] + rhs.data[0];
        result.data[1] = data[1] + rhs.data[1];
        result.data[2] = data[2] + rhs.data[2];
        result.data[3] = data[3] + rhs.data[3];

        result.data[4] = data[4] + rhs.data[4];
        result.data[5] = data[5] + rhs.data[5];
        result.data[6] = data[6] + rhs.data[6];
        result.data[7] = data[7] + rhs.data[7];

        result.data[8] = data[8] + rhs.data[8];
        result.data[9] = data[9] + rhs.data[9];
        result.data[10] = data[10] + rhs.data[10];
        result.data[11] = data[11] + rhs.data[11];

        return result;
    }

    /// Elementwise add operator (3-by-4)
    CUTLASS_HOST_DEVICE
    Matrix operator+(Matrix const& rhs) const { return add(rhs); }

    /// Elementwise add operator (3-by-4)
    CUTLASS_HOST_DEVICE
    Matrix& operator+=(Matrix const& rhs) {
        data[0] += rhs.data[0];
        data[1] += rhs.data[1];
        data[2] += rhs.data[2];
        data[3] += rhs.data[3];

        data[4] += rhs.data[4];
        data[5] += rhs.data[5];
        data[6] += rhs.data[6];
        data[7] += rhs.data[7];

        data[8] += rhs.data[8];
        data[9] += rhs.data[9];
        data[10] += rhs.data[10];
        data[11] += rhs.data[11];

        return *this;
    }

    /// Elementwise subtract operator (3-by-4)
    CUTLASS_HOST_DEVICE
    Matrix subtract(Matrix const& rhs) const {
        Matrix result;

        result.data[0] = data[0] - rhs.data[0];
        result.data[1] = data[1] - rhs.data[1];
        result.data[2] = data[2] - rhs.data[2];
        result.data[3] = data[3] - rhs.data[3];

        result.data[4] = data[4] - rhs.data[4];
        result.data[5] = data[5] - rhs.data[5];
        result.data[6] = data[6] - rhs.data[6];
        result.data[7] = data[7] - rhs.data[7];

        result.data[8] = data[8] - rhs.data[8];
        result.data[9] = data[9] - rhs.data[9];
        result.data[10] = data[10] - rhs.data[10];
        result.data[11] = data[11] - rhs.data[11];

        return result;
    }

    /// Elementwise subtract operator (3-by-4)
    CUTLASS_HOST_DEVICE
    Matrix operator-(Matrix const& rhs) const { return subtract(rhs); }

    /// Elementwise subtract operator (3-by-4)
    CUTLASS_HOST_DEVICE
    Matrix& operator-=(Matrix const& rhs) {
        data[0] -= rhs.data[0];
        data[1] -= rhs.data[1];
        data[2] -= rhs.data[2];
        data[3] -= rhs.data[3];

        data[4] -= rhs.data[4];
        data[5] -= rhs.data[5];
        data[6] -= rhs.data[6];
        data[7] -= rhs.data[7];

        data[8] -= rhs.data[8];
        data[9] -= rhs.data[9];
        data[10] -= rhs.data[10];
        data[11] -= rhs.data[11];

        return *this;
    }

    /// Elementwise multiply operator (3-by-4)
    CUTLASS_HOST_DEVICE
    Matrix multiply(Matrix const& rhs) const {
        Matrix result;

        result.data[0] = data[0] * rhs.data[0];
        result.data[1] = data[1] * rhs.data[1];
        result.data[2] = data[2] * rhs.data[2];
        result.data[3] = data[3] * rhs.data[3];

        result.data[4] = data[4] * rhs.data[4];
        result.data[5] = data[5] * rhs.data[5];
        result.data[6] = data[6] * rhs.data[6];
        result.data[7] = data[7] * rhs.data[7];

        result.data[8] = data[8] * rhs.data[8];
        result.data[9] = data[9] * rhs.data[9];
        result.data[10] = data[10] * rhs.data[10];
        result.data[11] = data[11] * rhs.data[11];

        return result;
    }

    /// Scalar multiply operator (3-by-4)
    CUTLASS_HOST_DEVICE
    Matrix multiply(Element const& s) const {
        Matrix result;

        result.data[0] = data[0] * s;
        result.data[1] = data[1] * s;
        result.data[2] = data[2] * s;
        result.data[3] = data[3] * s;

        result.data[4] = data[4] * s;
        result.data[5] = data[5] * s;
        result.data[6] = data[6] * s;
        result.data[7] = data[7] * s;

        result.data[8] = data[8] * s;
        result.data[9] = data[9] * s;
        result.data[10] = data[10] * s;
        result.data[11] = data[11] * s;

        return result;
    }

    /// Scalar multiply operator (3-by-4)
    CUTLASS_HOST_DEVICE
    Matrix operator*(Element const& s) const { return multiply(s); }

    /// Scalar multiply operator (3-by-4)
    CUTLASS_HOST_DEVICE
    Matrix& operator*=(Element const& s) {
        data[0] *= s;
        data[1] *= s;
        data[2] *= s;
        data[3] *= s;

        data[4] *= s;
        data[5] *= s;
        data[6] *= s;
        data[7] *= s;

        data[8] *= s;
        data[9] *= s;
        data[10] *= s;
        data[11] *= s;

        return *this;
    }

    /// Elementwise divide operator (3-by-4)
    CUTLASS_HOST_DEVICE
    Matrix divide(Matrix const& rhs) const {
        Matrix result;

        result.data[0] = data[0] / rhs.data[0];
        result.data[1] = data[1] / rhs.data[1];
        result.data[2] = data[2] / rhs.data[2];
        result.data[3] = data[3] / rhs.data[3];

        result.data[4] = data[4] / rhs.data[4];
        result.data[5] = data[5] / rhs.data[5];
        result.data[6] = data[6] / rhs.data[6];
        result.data[7] = data[7] / rhs.data[7];

        result.data[8] = data[8] / rhs.data[8];
        result.data[9] = data[9] / rhs.data[9];
        result.data[10] = data[10] / rhs.data[10];
        result.data[11] = data[11] / rhs.data[11];

        return result;
    }

    /// Scalar divide operator (3-by-4)
    CUTLASS_HOST_DEVICE
    Matrix divide(Element const& s) const {
        Matrix result;

        result.data[0] = data[0] / s;
        result.data[1] = data[1] / s;
        result.data[2] = data[2] / s;
        result.data[3] = data[3] / s;

        result.data[4] = data[4] / s;
        result.data[5] = data[5] / s;
        result.data[6] = data[6] / s;
        result.data[7] = data[7] / s;

        result.data[8] = data[8] / s;
        result.data[9] = data[9] / s;
        result.data[10] = data[10] / s;
        result.data[11] = data[11] / s;

        return result;
    }

    /// Scalar divide operator (3-by-4)
    CUTLASS_HOST_DEVICE
    Matrix operator/(Element const& s) const { return divide(s); }

    /// Scalar divide operator (3-by-4)
    CUTLASS_HOST_DEVICE
    Matrix& operator/=(Element const& s) {
        data[0] /= s;
        data[1] /= s;
        data[2] /= s;
        data[3] /= s;

        data[4] /= s;
        data[5] /= s;
        data[6] /= s;
        data[7] /= s;

        data[8] /= s;
        data[9] /= s;
        data[10] /= s;
        data[11] /= s;

        return *this;
    }

    /// Elementwise divide operator (3-by-4)
    CUTLASS_HOST_DEVICE
    Matrix operator/(Matrix const& rhs) const { return divide(rhs); }

    /// Elementwise divide operator (3-by-4)
    CUTLASS_HOST_DEVICE
    Matrix& operator/=(Matrix const& rhs) {
        data[0] /= rhs.data[0];
        data[1] /= rhs.data[1];
        data[2] /= rhs.data[2];
        data[3] /= rhs.data[3];

        data[4] /= rhs.data[4];
        data[5] /= rhs.data[5];
        data[6] /= rhs.data[6];
        data[7] /= rhs.data[7];

        data[8] /= rhs.data[8];
        data[9] /= rhs.data[9];
        data[10] /= rhs.data[10];
        data[11] /= rhs.data[11];

        return *this;
    }

    /// Negates each element of the matrix
    CUTLASS_HOST_DEVICE
    Matrix operator-() const {
        Matrix m;

        m.data[0] = -m.data[0];
        m.data[1] = -m.data[1];
        m.data[2] = -m.data[2];
        m.data[3] = -m.data[3];
        m.data[4] = -m.data[4];
        m.data[5] = -m.data[5];
        m.data[6] = -m.data[6];
        m.data[7] = -m.data[7];
        m.data[8] = -m.data[8];
        m.data[9] = -m.data[9];
        m.data[10] = -m.data[10];
        m.data[11] = -m.data[11];

        return m;
    }

    /// Matrix product of size 3-by-1-by-4
    CUTLASS_HOST_DEVICE
    Matrix<Element, 3, 1> product(
            Matrix<Element, 4, 1> const& rhs,
            Matrix<Element, 3, 1> accum = Matrix<Element, 3, 1>()) const {
        // k=0
        accum.data[0] += data[0] * rhs.data[0];
        accum.data[1] += data[4] * rhs.data[0];
        accum.data[2] += data[8] * rhs.data[0];

        // k=1
        accum.data[0] += data[1] * rhs.data[1];
        accum.data[1] += data[5] * rhs.data[1];
        accum.data[2] += data[9] * rhs.data[1];

        // k=2
        accum.data[0] += data[2] * rhs.data[2];
        accum.data[1] += data[6] * rhs.data[2];
        accum.data[2] += data[10] * rhs.data[2];

        // k=3
        accum.data[0] += data[3] * rhs.data[3];
        accum.data[1] += data[7] * rhs.data[3];
        accum.data[2] += data[11] * rhs.data[3];

        return accum;
    }

    /// Matrix product of size 3-by-1-by-4
    CUTLASS_HOST_DEVICE
    Matrix<Element, 3, 1> operator*(Matrix<Element, 4, 1> const& rhs) const {
        return product(rhs);
    }

    /// Matrix product of size 3-by-2-by-4
    CUTLASS_HOST_DEVICE
    Matrix<Element, 3, 2> product(
            Matrix<Element, 4, 2> const& rhs,
            Matrix<Element, 3, 2> accum = Matrix<Element, 3, 2>()) const {
        // k=0
        accum.data[0] += data[0] * rhs.data[0];
        accum.data[1] += data[0] * rhs.data[1];
        accum.data[2] += data[4] * rhs.data[0];
        accum.data[3] += data[4] * rhs.data[1];
        accum.data[4] += data[8] * rhs.data[0];
        accum.data[5] += data[8] * rhs.data[1];

        // k=1
        accum.data[0] += data[1] * rhs.data[2];
        accum.data[1] += data[1] * rhs.data[3];
        accum.data[2] += data[5] * rhs.data[2];
        accum.data[3] += data[5] * rhs.data[3];
        accum.data[4] += data[9] * rhs.data[2];
        accum.data[5] += data[9] * rhs.data[3];

        // k=2
        accum.data[0] += data[2] * rhs.data[4];
        accum.data[1] += data[2] * rhs.data[5];
        accum.data[2] += data[6] * rhs.data[4];
        accum.data[3] += data[6] * rhs.data[5];
        accum.data[4] += data[10] * rhs.data[4];
        accum.data[5] += data[10] * rhs.data[5];

        // k=3
        accum.data[0] += data[3] * rhs.data[6];
        accum.data[1] += data[3] * rhs.data[7];
        accum.data[2] += data[7] * rhs.data[6];
        accum.data[3] += data[7] * rhs.data[7];
        accum.data[4] += data[11] * rhs.data[6];
        accum.data[5] += data[11] * rhs.data[7];

        return accum;
    }

    /// Matrix product of size 3-by-2-by-4
    CUTLASS_HOST_DEVICE
    Matrix<Element, 3, 2> operator*(Matrix<Element, 4, 2> const& rhs) const {
        return product(rhs);
    }

    /// Matrix product of size 3-by-3-by-4
    CUTLASS_HOST_DEVICE
    Matrix<Element, 3, 3> product(
            Matrix<Element, 4, 3> const& rhs,
            Matrix<Element, 3, 3> accum = Matrix<Element, 3, 3>()) const {
        // k=0
        accum.data[0] += data[0] * rhs.data[0];
        accum.data[1] += data[0] * rhs.data[1];
        accum.data[2] += data[0] * rhs.data[2];
        accum.data[3] += data[4] * rhs.data[0];
        accum.data[4] += data[4] * rhs.data[1];
        accum.data[5] += data[4] * rhs.data[2];
        accum.data[6] += data[8] * rhs.data[0];
        accum.data[7] += data[8] * rhs.data[1];
        accum.data[8] += data[8] * rhs.data[2];

        // k=1
        accum.data[0] += data[1] * rhs.data[3];
        accum.data[1] += data[1] * rhs.data[4];
        accum.data[2] += data[1] * rhs.data[5];
        accum.data[3] += data[5] * rhs.data[3];
        accum.data[4] += data[5] * rhs.data[4];
        accum.data[5] += data[5] * rhs.data[5];
        accum.data[6] += data[9] * rhs.data[3];
        accum.data[7] += data[9] * rhs.data[4];
        accum.data[8] += data[9] * rhs.data[5];

        // k=2
        accum.data[0] += data[2] * rhs.data[6];
        accum.data[1] += data[2] * rhs.data[7];
        accum.data[2] += data[2] * rhs.data[8];
        accum.data[3] += data[6] * rhs.data[6];
        accum.data[4] += data[6] * rhs.data[7];
        accum.data[5] += data[6] * rhs.data[8];
        accum.data[6] += data[10] * rhs.data[6];
        accum.data[7] += data[10] * rhs.data[7];
        accum.data[8] += data[10] * rhs.data[8];

        // k=3
        accum.data[0] += data[3] * rhs.data[9];
        accum.data[1] += data[3] * rhs.data[10];
        accum.data[2] += data[3] * rhs.data[11];
        accum.data[3] += data[7] * rhs.data[9];
        accum.data[4] += data[7] * rhs.data[10];
        accum.data[5] += data[7] * rhs.data[11];
        accum.data[6] += data[11] * rhs.data[9];
        accum.data[7] += data[11] * rhs.data[10];
        accum.data[8] += data[11] * rhs.data[11];

        return accum;
    }

    /// Matrix product of size 3-by-3-by-4
    CUTLASS_HOST_DEVICE
    Matrix<Element, 3, 3> operator*(Matrix<Element, 4, 3> const& rhs) const {
        return product(rhs);
    }

    /// Matrix product of size 3-by-4-by-4
    CUTLASS_HOST_DEVICE
    Matrix<Element, 3, 4> product(
            Matrix<Element, 4, 4> const& rhs,
            Matrix<Element, 3, 4> accum = Matrix<Element, 3, 4>()) const {
        // k=0
        accum.data[0] += data[0] * rhs.data[0];
        accum.data[1] += data[0] * rhs.data[1];
        accum.data[2] += data[0] * rhs.data[2];
        accum.data[3] += data[0] * rhs.data[3];
        accum.data[4] += data[4] * rhs.data[0];
        accum.data[5] += data[4] * rhs.data[1];
        accum.data[6] += data[4] * rhs.data[2];
        accum.data[7] += data[4] * rhs.data[3];
        accum.data[8] += data[8] * rhs.data[0];
        accum.data[9] += data[8] * rhs.data[1];
        accum.data[10] += data[8] * rhs.data[2];
        accum.data[11] += data[8] * rhs.data[3];

        // k=1
        accum.data[0] += data[1] * rhs.data[4];
        accum.data[1] += data[1] * rhs.data[5];
        accum.data[2] += data[1] * rhs.data[6];
        accum.data[3] += data[1] * rhs.data[7];
        accum.data[4] += data[5] * rhs.data[4];
        accum.data[5] += data[5] * rhs.data[5];
        accum.data[6] += data[5] * rhs.data[6];
        accum.data[7] += data[5] * rhs.data[7];
        accum.data[8] += data[9] * rhs.data[4];
        accum.data[9] += data[9] * rhs.data[5];
        accum.data[10] += data[9] * rhs.data[6];
        accum.data[11] += data[9] * rhs.data[7];

        // k=2
        accum.data[0] += data[2] * rhs.data[8];
        accum.data[1] += data[2] * rhs.data[9];
        accum.data[2] += data[2] * rhs.data[10];
        accum.data[3] += data[2] * rhs.data[11];
        accum.data[4] += data[6] * rhs.data[8];
        accum.data[5] += data[6] * rhs.data[9];
        accum.data[6] += data[6] * rhs.data[10];
        accum.data[7] += data[6] * rhs.data[11];
        accum.data[8] += data[10] * rhs.data[8];
        accum.data[9] += data[10] * rhs.data[9];
        accum.data[10] += data[10] * rhs.data[10];
        accum.data[11] += data[10] * rhs.data[11];

        // k=3
        accum.data[0] += data[3] * rhs.data[12];
        accum.data[1] += data[3] * rhs.data[13];
        accum.data[2] += data[3] * rhs.data[14];
        accum.data[3] += data[3] * rhs.data[15];
        accum.data[4] += data[7] * rhs.data[12];
        accum.data[5] += data[7] * rhs.data[13];
        accum.data[6] += data[7] * rhs.data[14];
        accum.data[7] += data[7] * rhs.data[15];
        accum.data[8] += data[11] * rhs.data[12];
        accum.data[9] += data[11] * rhs.data[13];
        accum.data[10] += data[11] * rhs.data[14];
        accum.data[11] += data[11] * rhs.data[15];

        return accum;
    }

    /// Matrix product of size 3-by-4-by-4
    CUTLASS_HOST_DEVICE
    Matrix<Element, 3, 4> operator*(Matrix<Element, 4, 4> const& rhs) const {
        return product(rhs);
    }

    /// Matrix product of size 3-by-4-by-4
    CUTLASS_HOST_DEVICE
    Matrix& operator*=(Matrix<Element, 4, 4> const& rhs) {
        *this = product(rhs);
        return *this;
    }

    /// Returns the sum of elements
    CUTLASS_HOST_DEVICE
    Element sum(Element accum = Element()) const {
        accum += data[0];
        accum += data[1];
        accum += data[2];
        accum += data[3];
        accum += data[4];
        accum += data[5];
        accum += data[6];
        accum += data[7];
        accum += data[8];
        accum += data[9];
        accum += data[10];
        accum += data[11];

        return accum;
    }

    /// Returns the sum of squared elements
    CUTLASS_HOST_DEVICE
    Element norm(Element accum = Element()) const {
        accum += data[0] * data[0];
        accum += data[1] * data[1];
        accum += data[2] * data[2];
        accum += data[3] * data[3];
        accum += data[4] * data[4];
        accum += data[5] * data[5];
        accum += data[6] * data[6];
        accum += data[7] * data[7];
        accum += data[8] * data[8];
        accum += data[9] * data[9];
        accum += data[10] * data[10];
        accum += data[11] * data[11];

        return accum;
    }

    /// Returns square root of the norm
    CUTLASS_HOST_DEVICE
    Element magnitude() const { return fast_sqrt(norm()); }

    /// Returns the sum of diagonal elements
    CUTLASS_HOST_DEVICE
    Element trace(Element accum = Element()) const {
        accum += data[0];
        accum += data[5];
        accum += data[10];

        return accum;
    }
};

/// Template alias for 3-by-4 matrix
template <typename Element>
using Matrix3x4 = Matrix<Element, 3, 4>;

/// Free funciton to infer element type from template arguments
template <typename Element>
CUTLASS_HOST_DEVICE Matrix3x4<Element> make_Matrix3x4(
        Element _0_0, Element _0_1, Element _0_2, Element _0_3, Element _1_0,
        Element _1_1, Element _1_2, Element _1_3, Element _2_0, Element _2_1,
        Element _2_2, Element _2_3) {
    return Matrix3x4<Element>(_0_0, _0_1, _0_2, _0_3, _1_0, _1_1, _1_2, _1_3,
                              _2_0, _2_1, _2_2, _2_3);
}

/////////////////////////////////////////////////////////////////////////////////////////////////

/// 4-by-1 matrix template class definition
template <typename Element_>
struct Matrix<Element_, 4, 1> {
    //
    // Type definitions
    //

    /// Element data type
    using Element = Element_;

    /// Number of rows in matrix
    static int const kRows = 4;

    /// Number of columns in matrix
    static int const kColumns = 1;

    /// Layout of matrix in underlying array
    using Layout = layout::RowMajor;

    /// Number of elements in matrix
    static int const kCount = 4;

    //
    // Data members
    //

    /// Elements of the matrix in row-major layout
    Array<Element, kCount> data;

    //
    // Methods
    //

    /// Constructs a zero matrix
    CUTLASS_HOST_DEVICE
    Matrix() { data.clear(); }

    /// Copy constructor for a 4-by-1 matrix
    CUTLASS_HOST_DEVICE
    Matrix(Matrix const& rhs) { data = rhs.data; }

    /// Constucts a 4-by-1 matrix from scalar elements
    CUTLASS_HOST_DEVICE
    Matrix(Element _0_0, Element _1_0, Element _2_0, Element _3_0) {
        data[0] = _0_0;
        data[1] = _1_0;
        data[2] = _2_0;
        data[3] = _3_0;
    }

    /// Constructs a matrix from a uniform element
    CUTLASS_HOST_DEVICE
    static Matrix uniform(Element s) {
        Matrix m;

        m.data[0] = s;
        m.data[1] = s;
        m.data[2] = s;
        m.data[3] = s;

        return m;
    }

    /// Constructs a matrix from a uniform element 1
    CUTLASS_HOST_DEVICE
    static Matrix ones() { return uniform(Element(1)); }

    /// Constructs a matrix from a uniform element 0
    CUTLASS_HOST_DEVICE
    static Matrix zero() { return Matrix(); }

    /// Returns a transposed matrix
    CUTLASS_HOST_DEVICE
    Matrix<Element, 1, 4> transpose() const {
        Matrix<Element, 1, 4> mt;

        mt.data[0] = data[0];
        mt.data[1] = data[1];
        mt.data[2] = data[2];
        mt.data[3] = data[3];

        return mt;
    }

    /// Accesses an element by coordinate
    CUTLASS_HOST_DEVICE
    Element at(int i, int j) const { return data[i * 4 + j]; }

    /// Accesses an element by coordinate
    CUTLASS_HOST_DEVICE
    Element& at(int i, int j) { return data[i * 4 + j]; }

    /// Accesses an element by coordinate
    CUTLASS_HOST_DEVICE
    Element at(Coord<2> const& coord) const { return at(coord[0], coord[1]); }

    /// Accesses an element by coordinate
    CUTLASS_HOST_DEVICE
    Element& at(Coord<2> const& coord) { return at(coord[0], coord[1]); }

    /// Accesses an element by offset
    CUTLASS_HOST_DEVICE
    Element& at(int offset) { return data[offset]; }

    /// Accesses an element by offset
    CUTLASS_HOST_DEVICE
    Element at(int offset) const { return data[offset]; }

    /// Accesses an element by coordinate
    CUTLASS_HOST_DEVICE
    Element operator[](Coord<2> const& coord) const {
        return at(coord[0], coord[1]);
    }

    /// Accesses an element by coordinate
    CUTLASS_HOST_DEVICE
    Element& operator[](Coord<2> const& coord) {
        return at(coord[0], coord[1]);
    }

    /// Accesses an element by offset
    CUTLASS_HOST_DEVICE
    Element& operator[](int offset) { return data[offset]; }

    /// Accesses an element by offset
    CUTLASS_HOST_DEVICE
    Element operator[](int offset) const { return data[offset]; }

    /// Gets a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix<Element, 2, 1> slice_2x1(int i = 0, int j = 0) const {
        Matrix<Element, 2, 1> m;

        m.data[0] = data[i * 1 + j + 0];
        m.data[1] = data[i * 1 + j + 1];

        return m;
    }

    /// Overwrites a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix& set_slice_2x1(Matrix<Element, 2, 1> const& m, int i = 0,
                          int j = 0) {
        data[i * 1 + j + 0] = m.data[0];
        data[i * 1 + j + 1] = m.data[1];

        return *this;
    }

    /// Gets a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix<Element, 3, 1> slice_3x1(int i = 0, int j = 0) const {
        Matrix<Element, 3, 1> m;

        m.data[0] = data[i * 1 + j + 0];
        m.data[1] = data[i * 1 + j + 1];
        m.data[2] = data[i * 1 + j + 2];

        return m;
    }

    /// Overwrites a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix& set_slice_3x1(Matrix<Element, 3, 1> const& m, int i = 0,
                          int j = 0) {
        data[i * 1 + j + 0] = m.data[0];
        data[i * 1 + j + 1] = m.data[1];
        data[i * 1 + j + 2] = m.data[2];

        return *this;
    }

    /// Gets a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix<Element, 4, 1> slice_4x1(int i = 0, int j = 0) const {
        Matrix<Element, 4, 1> m;

        m.data[0] = data[i * 1 + j + 0];
        m.data[1] = data[i * 1 + j + 1];
        m.data[2] = data[i * 1 + j + 2];
        m.data[3] = data[i * 1 + j + 3];

        return m;
    }

    /// Overwrites a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix& set_slice_4x1(Matrix<Element, 4, 1> const& m, int i = 0,
                          int j = 0) {
        data[i * 1 + j + 0] = m.data[0];
        data[i * 1 + j + 1] = m.data[1];
        data[i * 1 + j + 2] = m.data[2];
        data[i * 1 + j + 3] = m.data[3];

        return *this;
    }

    CUTLASS_HOST_DEVICE
    Matrix<Element, 4, 1> column(int j) const { return slice_4x1(0, j); }

    Matrix& set_column(Matrix<Element, 4, 1> const& v, int j = 0) {
        return set_slice_4x1(v, 0, j);
    }

    /// Concatenates this matrix with a a 4-by-1 matrix to form a 4-by-2 matrix
    CUTLASS_HOST_DEVICE
    Matrix<Element, 4, 2> hcat(Matrix<Element, 4, 1> const& rhs) const {
        return Matrix<Element, 4, 2>::hcat(*this, rhs);
    }

    /// Concatenates this matrix with a a 4-by-2 matrix to form a 4-by-3 matrix
    CUTLASS_HOST_DEVICE
    Matrix<Element, 4, 3> hcat(Matrix<Element, 4, 2> const& rhs) const {
        return Matrix<Element, 4, 3>::hcat(*this, rhs);
    }

    /// Concatenates this matrix with a a 4-by-3 matrix to form a 4-by-4 matrix
    CUTLASS_HOST_DEVICE
    Matrix<Element, 4, 4> hcat(Matrix<Element, 4, 3> const& rhs) const {
        return Matrix<Element, 4, 4>::hcat(*this, rhs);
    }

    /// Forms a 4-by-1 matrix by vertically concatenating an Element with a
    /// 3-by-1 matrix
    CUTLASS_HOST_DEVICE
    static Matrix vcat(Element upper, Matrix<Element, 3, 1> const& lower) {
        return Matrix(upper, lower.at(0, 0), lower.at(1, 0), lower.at(2, 0));
    }

    /// Forms a 4-by-1 matrix by vertically concatenating a 2-by-1 matrix with a
    /// 2-by-1 matrix
    CUTLASS_HOST_DEVICE
    static Matrix vcat(Matrix<Element, 2, 1> const& upper,
                       Matrix<Element, 2, 1> const& lower) {
        return Matrix(upper.at(0, 0), upper.at(1, 0), lower.at(0, 0),
                      lower.at(1, 0));
    }

    /// Forms a 4-by-1 matrix by vertically concatenating a 3-by-1 matrix with
    /// an Element
    CUTLASS_HOST_DEVICE
    static Matrix vcat(Matrix<Element, 3, 1> const& upper, Element lower) {
        return Matrix(upper.at(0, 0), upper.at(1, 0), upper.at(2, 0), lower);
    }

    /// Elementwise add operator (4-by-1)
    CUTLASS_HOST_DEVICE
    Matrix add(Matrix const& rhs) const {
        Matrix result;

        result.data[0] = data[0] + rhs.data[0];

        result.data[1] = data[1] + rhs.data[1];

        result.data[2] = data[2] + rhs.data[2];

        result.data[3] = data[3] + rhs.data[3];

        return result;
    }

    /// Elementwise add operator (4-by-1)
    CUTLASS_HOST_DEVICE
    Matrix operator+(Matrix const& rhs) const { return add(rhs); }

    /// Elementwise add operator (4-by-1)
    CUTLASS_HOST_DEVICE
    Matrix& operator+=(Matrix const& rhs) {
        data[0] += rhs.data[0];

        data[1] += rhs.data[1];

        data[2] += rhs.data[2];

        data[3] += rhs.data[3];

        return *this;
    }

    /// Elementwise subtract operator (4-by-1)
    CUTLASS_HOST_DEVICE
    Matrix subtract(Matrix const& rhs) const {
        Matrix result;

        result.data[0] = data[0] - rhs.data[0];

        result.data[1] = data[1] - rhs.data[1];

        result.data[2] = data[2] - rhs.data[2];

        result.data[3] = data[3] - rhs.data[3];

        return result;
    }

    /// Elementwise subtract operator (4-by-1)
    CUTLASS_HOST_DEVICE
    Matrix operator-(Matrix const& rhs) const { return subtract(rhs); }

    /// Elementwise subtract operator (4-by-1)
    CUTLASS_HOST_DEVICE
    Matrix& operator-=(Matrix const& rhs) {
        data[0] -= rhs.data[0];

        data[1] -= rhs.data[1];

        data[2] -= rhs.data[2];

        data[3] -= rhs.data[3];

        return *this;
    }

    /// Elementwise multiply operator (4-by-1)
    CUTLASS_HOST_DEVICE
    Matrix multiply(Matrix const& rhs) const {
        Matrix result;

        result.data[0] = data[0] * rhs.data[0];

        result.data[1] = data[1] * rhs.data[1];

        result.data[2] = data[2] * rhs.data[2];

        result.data[3] = data[3] * rhs.data[3];

        return result;
    }

    /// Scalar multiply operator (4-by-1)
    CUTLASS_HOST_DEVICE
    Matrix multiply(Element const& s) const {
        Matrix result;

        result.data[0] = data[0] * s;

        result.data[1] = data[1] * s;

        result.data[2] = data[2] * s;

        result.data[3] = data[3] * s;

        return result;
    }

    /// Scalar multiply operator (4-by-1)
    CUTLASS_HOST_DEVICE
    Matrix operator*(Element const& s) const { return multiply(s); }

    /// Scalar multiply operator (4-by-1)
    CUTLASS_HOST_DEVICE
    Matrix& operator*=(Element const& s) {
        data[0] *= s;

        data[1] *= s;

        data[2] *= s;

        data[3] *= s;

        return *this;
    }

    /// Elementwise divide operator (4-by-1)
    CUTLASS_HOST_DEVICE
    Matrix divide(Matrix const& rhs) const {
        Matrix result;

        result.data[0] = data[0] / rhs.data[0];

        result.data[1] = data[1] / rhs.data[1];

        result.data[2] = data[2] / rhs.data[2];

        result.data[3] = data[3] / rhs.data[3];

        return result;
    }

    /// Scalar divide operator (4-by-1)
    CUTLASS_HOST_DEVICE
    Matrix divide(Element const& s) const {
        Matrix result;

        result.data[0] = data[0] / s;

        result.data[1] = data[1] / s;

        result.data[2] = data[2] / s;

        result.data[3] = data[3] / s;

        return result;
    }

    /// Scalar divide operator (4-by-1)
    CUTLASS_HOST_DEVICE
    Matrix operator/(Element const& s) const { return divide(s); }

    /// Scalar divide operator (4-by-1)
    CUTLASS_HOST_DEVICE
    Matrix& operator/=(Element const& s) {
        data[0] /= s;

        data[1] /= s;

        data[2] /= s;

        data[3] /= s;

        return *this;
    }

    /// Elementwise divide operator (4-by-1)
    CUTLASS_HOST_DEVICE
    Matrix operator/(Matrix const& rhs) const { return divide(rhs); }

    /// Elementwise divide operator (4-by-1)
    CUTLASS_HOST_DEVICE
    Matrix& operator/=(Matrix const& rhs) {
        data[0] /= rhs.data[0];

        data[1] /= rhs.data[1];

        data[2] /= rhs.data[2];

        data[3] /= rhs.data[3];

        return *this;
    }

    /// Negates each element of the matrix
    CUTLASS_HOST_DEVICE
    Matrix operator-() const {
        Matrix m;

        m.data[0] = -m.data[0];
        m.data[1] = -m.data[1];
        m.data[2] = -m.data[2];
        m.data[3] = -m.data[3];

        return m;
    }

    /// Matrix product of size 4-by-1-by-1
    CUTLASS_HOST_DEVICE
    Matrix<Element, 4, 1> product(
            Matrix<Element, 1, 1> const& rhs,
            Matrix<Element, 4, 1> accum = Matrix<Element, 4, 1>()) const {
        // k=0
        accum.data[0] += data[0] * rhs.data[0];
        accum.data[1] += data[1] * rhs.data[0];
        accum.data[2] += data[2] * rhs.data[0];
        accum.data[3] += data[3] * rhs.data[0];

        return accum;
    }

    /// Matrix product of size 4-by-1-by-1
    CUTLASS_HOST_DEVICE
    Matrix<Element, 4, 1> operator*(Matrix<Element, 1, 1> const& rhs) const {
        return product(rhs);
    }

    /// Matrix product of size 4-by-1-by-1
    CUTLASS_HOST_DEVICE
    Matrix& operator*=(Matrix<Element, 1, 1> const& rhs) {
        *this = product(rhs);
        return *this;
    }

    /// Matrix product of size 4-by-2-by-1
    CUTLASS_HOST_DEVICE
    Matrix<Element, 4, 2> product(
            Matrix<Element, 1, 2> const& rhs,
            Matrix<Element, 4, 2> accum = Matrix<Element, 4, 2>()) const {
        // k=0
        accum.data[0] += data[0] * rhs.data[0];
        accum.data[1] += data[0] * rhs.data[1];
        accum.data[2] += data[1] * rhs.data[0];
        accum.data[3] += data[1] * rhs.data[1];
        accum.data[4] += data[2] * rhs.data[0];
        accum.data[5] += data[2] * rhs.data[1];
        accum.data[6] += data[3] * rhs.data[0];
        accum.data[7] += data[3] * rhs.data[1];

        return accum;
    }

    /// Matrix product of size 4-by-2-by-1
    CUTLASS_HOST_DEVICE
    Matrix<Element, 4, 2> operator*(Matrix<Element, 1, 2> const& rhs) const {
        return product(rhs);
    }

    /// Matrix product of size 4-by-3-by-1
    CUTLASS_HOST_DEVICE
    Matrix<Element, 4, 3> product(
            Matrix<Element, 1, 3> const& rhs,
            Matrix<Element, 4, 3> accum = Matrix<Element, 4, 3>()) const {
        // k=0
        accum.data[0] += data[0] * rhs.data[0];
        accum.data[1] += data[0] * rhs.data[1];
        accum.data[2] += data[0] * rhs.data[2];
        accum.data[3] += data[1] * rhs.data[0];
        accum.data[4] += data[1] * rhs.data[1];
        accum.data[5] += data[1] * rhs.data[2];
        accum.data[6] += data[2] * rhs.data[0];
        accum.data[7] += data[2] * rhs.data[1];
        accum.data[8] += data[2] * rhs.data[2];
        accum.data[9] += data[3] * rhs.data[0];
        accum.data[10] += data[3] * rhs.data[1];
        accum.data[11] += data[3] * rhs.data[2];

        return accum;
    }

    /// Matrix product of size 4-by-3-by-1
    CUTLASS_HOST_DEVICE
    Matrix<Element, 4, 3> operator*(Matrix<Element, 1, 3> const& rhs) const {
        return product(rhs);
    }

    /// Matrix product of size 4-by-4-by-1
    CUTLASS_HOST_DEVICE
    Matrix<Element, 4, 4> product(
            Matrix<Element, 1, 4> const& rhs,
            Matrix<Element, 4, 4> accum = Matrix<Element, 4, 4>()) const {
        // k=0
        accum.data[0] += data[0] * rhs.data[0];
        accum.data[1] += data[0] * rhs.data[1];
        accum.data[2] += data[0] * rhs.data[2];
        accum.data[3] += data[0] * rhs.data[3];
        accum.data[4] += data[1] * rhs.data[0];
        accum.data[5] += data[1] * rhs.data[1];
        accum.data[6] += data[1] * rhs.data[2];
        accum.data[7] += data[1] * rhs.data[3];
        accum.data[8] += data[2] * rhs.data[0];
        accum.data[9] += data[2] * rhs.data[1];
        accum.data[10] += data[2] * rhs.data[2];
        accum.data[11] += data[2] * rhs.data[3];
        accum.data[12] += data[3] * rhs.data[0];
        accum.data[13] += data[3] * rhs.data[1];
        accum.data[14] += data[3] * rhs.data[2];
        accum.data[15] += data[3] * rhs.data[3];

        return accum;
    }

    /// Matrix product of size 4-by-4-by-1
    CUTLASS_HOST_DEVICE
    Matrix<Element, 4, 4> operator*(Matrix<Element, 1, 4> const& rhs) const {
        return product(rhs);
    }

    /// Dot product of vectors with extent 4
    CUTLASS_HOST_DEVICE
    Element dot(Matrix<Element, 4, 1> const& rhs,
                Element accum = Element()) const {
        accum += data[0] * rhs.data[0];
        accum += data[1] * rhs.data[1];
        accum += data[2] * rhs.data[2];
        accum += data[3] * rhs.data[3];
        return accum;
    }

    /// Dot product of vectors with extent 4
    CUTLASS_HOST_DEVICE
    Element dot(Matrix<Element, 1, 4> const& rhs,
                Element accum = Element()) const {
        accum += data[0] * rhs.data[0];
        accum += data[1] * rhs.data[1];
        accum += data[2] * rhs.data[2];
        accum += data[3] * rhs.data[3];
        return accum;
    }

    /// Returns the sum of elements
    CUTLASS_HOST_DEVICE
    Element sum(Element accum = Element()) const {
        accum += data[0];
        accum += data[1];
        accum += data[2];
        accum += data[3];

        return accum;
    }

    /// Returns the sum of squared elements
    CUTLASS_HOST_DEVICE
    Element norm(Element accum = Element()) const {
        accum += data[0] * data[0];
        accum += data[1] * data[1];
        accum += data[2] * data[2];
        accum += data[3] * data[3];

        return accum;
    }

    /// Returns square root of the norm
    CUTLASS_HOST_DEVICE
    Element magnitude() const { return fast_sqrt(norm()); }

    /// Returns the sum of diagonal elements
    CUTLASS_HOST_DEVICE
    Element trace(Element accum = Element()) const {
        accum += data[0];

        return accum;
    }
};

/// Template alias for 4-by-1 matrix
template <typename Element>
using Matrix4x1 = Matrix<Element, 4, 1>;

/// Free funciton to infer element type from template arguments
template <typename Element>
CUTLASS_HOST_DEVICE Matrix4x1<Element> make_Matrix4x1(Element _0_0,
                                                      Element _1_0,
                                                      Element _2_0,
                                                      Element _3_0) {
    return Matrix4x1<Element>(_0_0, _1_0, _2_0, _3_0);
}

/////////////////////////////////////////////////////////////////////////////////////////////////

/// 4-by-2 matrix template class definition
template <typename Element_>
struct Matrix<Element_, 4, 2> {
    //
    // Type definitions
    //

    /// Element data type
    using Element = Element_;

    /// Number of rows in matrix
    static int const kRows = 4;

    /// Number of columns in matrix
    static int const kColumns = 2;

    /// Layout of matrix in underlying array
    using Layout = layout::RowMajor;

    /// Number of elements in matrix
    static int const kCount = 8;

    //
    // Data members
    //

    /// Elements of the matrix in row-major layout
    Array<Element, kCount> data;

    //
    // Methods
    //

    /// Constructs a zero matrix
    CUTLASS_HOST_DEVICE
    Matrix() { data.clear(); }

    /// Copy constructor for a 4-by-2 matrix
    CUTLASS_HOST_DEVICE
    Matrix(Matrix const& rhs) { data = rhs.data; }

    /// Constucts a 4-by-2 matrix from scalar elements
    CUTLASS_HOST_DEVICE
    Matrix(Element _0_0, Element _0_1, Element _1_0, Element _1_1, Element _2_0,
           Element _2_1, Element _3_0, Element _3_1) {
        data[0] = _0_0;
        data[1] = _0_1;
        data[2] = _1_0;
        data[3] = _1_1;
        data[4] = _2_0;
        data[5] = _2_1;
        data[6] = _3_0;
        data[7] = _3_1;
    }

    /// Constucts a 4-by-2 matrix from row vectors
    CUTLASS_HOST_DEVICE
    Matrix(Matrix<Element, 1, 2> const& row_0,
           Matrix<Element, 1, 2> const& row_1,
           Matrix<Element, 1, 2> const& row_2,
           Matrix<Element, 1, 2> const& row_3) {
        data[0] = row_0.data[0];
        data[1] = row_0.data[1];
        data[2] = row_1.data[0];
        data[3] = row_1.data[1];
        data[4] = row_2.data[0];
        data[5] = row_2.data[1];
        data[6] = row_3.data[0];
        data[7] = row_3.data[1];
    }

    /// Static method to construct a 4-by-2 matrix from column vectors
    CUTLASS_HOST_DEVICE
    static Matrix from_columns(Matrix<Element, 2, 1> const& column_0,
                               Matrix<Element, 2, 1> const& column_1) {
        Matrix result;

        result.data[0] = column_0.data[0];
        result.data[1] = column_1.data[0];
        result.data[2] = column_0.data[1];
        result.data[3] = column_1.data[1];
        result.data[4] = column_0.data[2];
        result.data[5] = column_1.data[2];
        result.data[6] = column_0.data[3];
        result.data[7] = column_1.data[3];
        return result;
    }

    /// Constructs a matrix from a uniform element
    CUTLASS_HOST_DEVICE
    static Matrix uniform(Element s) {
        Matrix m;

        m.data[0] = s;
        m.data[1] = s;
        m.data[2] = s;
        m.data[3] = s;
        m.data[4] = s;
        m.data[5] = s;
        m.data[6] = s;
        m.data[7] = s;

        return m;
    }

    /// Constructs a matrix from a uniform element 1
    CUTLASS_HOST_DEVICE
    static Matrix ones() { return uniform(Element(1)); }

    /// Constructs a matrix from a uniform element 0
    CUTLASS_HOST_DEVICE
    static Matrix zero() { return Matrix(); }

    /// Constructs a matrix from elements along its diagonal
    CUTLASS_HOST_DEVICE
    static Matrix from_diagonal(Matrix<Element, 2, 1> const& diag) {
        Matrix m;

        m.data[0] = diag.data[0];
        m.data[5] = diag.data[1];
        m.data[10] = diag.data[2];
        m.data[15] = diag.data[3];

        return m;
    }

    /// Constructs a matrix from elements along its diagonal
    CUTLASS_HOST_DEVICE
    static Matrix from_diagonal(Matrix<Element, 1, 2> const& diag) {
        Matrix m;

        m.data[0] = diag.data[0];
        m.data[5] = diag.data[1];
        m.data[10] = diag.data[2];
        m.data[15] = diag.data[3];

        return m;
    }

    /// Gets an array of diagonal elements
    CUTLASS_HOST_DEVICE
    Matrix<Element, 2, 1> diagonal() const {
        Matrix<Element, 2, 1> diag;

        diag.data[0] = data[0];
        diag.data[1] = data[5];
        diag.data[2] = data[10];
        diag.data[3] = data[15];

        return diag;
    }

    /// Returns a transposed matrix
    CUTLASS_HOST_DEVICE
    Matrix<Element, 2, 4> transpose() const {
        Matrix<Element, 2, 4> mt;

        mt.data[0] = data[0];
        mt.data[4] = data[1];
        mt.data[1] = data[2];
        mt.data[5] = data[3];
        mt.data[2] = data[4];
        mt.data[6] = data[5];
        mt.data[3] = data[6];
        mt.data[7] = data[7];

        return mt;
    }

    /// Accesses an element by coordinate
    CUTLASS_HOST_DEVICE
    Element at(int i, int j) const { return data[i * 4 + j]; }

    /// Accesses an element by coordinate
    CUTLASS_HOST_DEVICE
    Element& at(int i, int j) { return data[i * 4 + j]; }

    /// Accesses an element by coordinate
    CUTLASS_HOST_DEVICE
    Element at(Coord<2> const& coord) const { return at(coord[0], coord[1]); }

    /// Accesses an element by coordinate
    CUTLASS_HOST_DEVICE
    Element& at(Coord<2> const& coord) { return at(coord[0], coord[1]); }

    /// Accesses an element by offset
    CUTLASS_HOST_DEVICE
    Element& at(int offset) { return data[offset]; }

    /// Accesses an element by offset
    CUTLASS_HOST_DEVICE
    Element at(int offset) const { return data[offset]; }

    /// Accesses an element by coordinate
    CUTLASS_HOST_DEVICE
    Element operator[](Coord<2> const& coord) const {
        return at(coord[0], coord[1]);
    }

    /// Accesses an element by coordinate
    CUTLASS_HOST_DEVICE
    Element& operator[](Coord<2> const& coord) {
        return at(coord[0], coord[1]);
    }

    /// Accesses an element by offset
    CUTLASS_HOST_DEVICE
    Element& operator[](int offset) { return data[offset]; }

    /// Accesses an element by offset
    CUTLASS_HOST_DEVICE
    Element operator[](int offset) const { return data[offset]; }

    /// Gets a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix<Element, 1, 2> slice_1x2(int i = 0, int j = 0) const {
        Matrix<Element, 1, 2> m;

        m.data[0] = data[i * 2 + j + 0];
        m.data[1] = data[i * 2 + j + 1];

        return m;
    }

    /// Overwrites a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix& set_slice_1x2(Matrix<Element, 1, 2> const& m, int i = 0,
                          int j = 0) {
        data[i * 2 + j + 0] = m.data[0];
        data[i * 2 + j + 1] = m.data[1];

        return *this;
    }

    CUTLASS_HOST_DEVICE
    Matrix<Element, 1, 2> row(int i) const { return slice_1x2(i, 0); }

    Matrix& set_row(Matrix<Element, 1, 2> const& v, int i = 0) {
        return set_slice_1x2(v, i, 0);
    }

    /// Gets a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix<Element, 2, 1> slice_2x1(int i = 0, int j = 0) const {
        Matrix<Element, 2, 1> m;

        m.data[0] = data[i * 2 + j + 0];
        m.data[1] = data[i * 2 + j + 2];

        return m;
    }

    /// Overwrites a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix& set_slice_2x1(Matrix<Element, 2, 1> const& m, int i = 0,
                          int j = 0) {
        data[i * 2 + j + 0] = m.data[0];
        data[i * 2 + j + 2] = m.data[1];

        return *this;
    }

    /// Gets a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix<Element, 2, 2> slice_2x2(int i = 0, int j = 0) const {
        Matrix<Element, 2, 2> m;

        m.data[0] = data[i * 2 + j + 0];
        m.data[1] = data[i * 2 + j + 1];
        m.data[2] = data[i * 2 + j + 2];
        m.data[3] = data[i * 2 + j + 3];

        return m;
    }

    /// Overwrites a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix& set_slice_2x2(Matrix<Element, 2, 2> const& m, int i = 0,
                          int j = 0) {
        data[i * 2 + j + 0] = m.data[0];
        data[i * 2 + j + 1] = m.data[1];
        data[i * 2 + j + 2] = m.data[2];
        data[i * 2 + j + 3] = m.data[3];

        return *this;
    }

    /// Gets a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix<Element, 3, 1> slice_3x1(int i = 0, int j = 0) const {
        Matrix<Element, 3, 1> m;

        m.data[0] = data[i * 2 + j + 0];
        m.data[1] = data[i * 2 + j + 2];
        m.data[2] = data[i * 2 + j + 4];

        return m;
    }

    /// Overwrites a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix& set_slice_3x1(Matrix<Element, 3, 1> const& m, int i = 0,
                          int j = 0) {
        data[i * 2 + j + 0] = m.data[0];
        data[i * 2 + j + 2] = m.data[1];
        data[i * 2 + j + 4] = m.data[2];

        return *this;
    }

    /// Gets a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix<Element, 3, 2> slice_3x2(int i = 0, int j = 0) const {
        Matrix<Element, 3, 2> m;

        m.data[0] = data[i * 2 + j + 0];
        m.data[1] = data[i * 2 + j + 1];
        m.data[2] = data[i * 2 + j + 2];
        m.data[3] = data[i * 2 + j + 3];
        m.data[4] = data[i * 2 + j + 4];
        m.data[5] = data[i * 2 + j + 5];

        return m;
    }

    /// Overwrites a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix& set_slice_3x2(Matrix<Element, 3, 2> const& m, int i = 0,
                          int j = 0) {
        data[i * 2 + j + 0] = m.data[0];
        data[i * 2 + j + 1] = m.data[1];
        data[i * 2 + j + 2] = m.data[2];
        data[i * 2 + j + 3] = m.data[3];
        data[i * 2 + j + 4] = m.data[4];
        data[i * 2 + j + 5] = m.data[5];

        return *this;
    }

    /// Gets a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix<Element, 4, 1> slice_4x1(int i = 0, int j = 0) const {
        Matrix<Element, 4, 1> m;

        m.data[0] = data[i * 2 + j + 0];
        m.data[1] = data[i * 2 + j + 2];
        m.data[2] = data[i * 2 + j + 4];
        m.data[3] = data[i * 2 + j + 6];

        return m;
    }

    /// Overwrites a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix& set_slice_4x1(Matrix<Element, 4, 1> const& m, int i = 0,
                          int j = 0) {
        data[i * 2 + j + 0] = m.data[0];
        data[i * 2 + j + 2] = m.data[1];
        data[i * 2 + j + 4] = m.data[2];
        data[i * 2 + j + 6] = m.data[3];

        return *this;
    }

    CUTLASS_HOST_DEVICE
    Matrix<Element, 4, 1> column(int j) const { return slice_4x1(0, j); }

    Matrix& set_column(Matrix<Element, 4, 1> const& v, int j = 0) {
        return set_slice_4x1(v, 0, j);
    }

    /// Gets a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix<Element, 4, 2> slice_4x2(int i = 0, int j = 0) const {
        Matrix<Element, 4, 2> m;

        m.data[0] = data[i * 2 + j + 0];
        m.data[1] = data[i * 2 + j + 1];
        m.data[2] = data[i * 2 + j + 2];
        m.data[3] = data[i * 2 + j + 3];
        m.data[4] = data[i * 2 + j + 4];
        m.data[5] = data[i * 2 + j + 5];
        m.data[6] = data[i * 2 + j + 6];
        m.data[7] = data[i * 2 + j + 7];

        return m;
    }

    /// Overwrites a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix& set_slice_4x2(Matrix<Element, 4, 2> const& m, int i = 0,
                          int j = 0) {
        data[i * 2 + j + 0] = m.data[0];
        data[i * 2 + j + 1] = m.data[1];
        data[i * 2 + j + 2] = m.data[2];
        data[i * 2 + j + 3] = m.data[3];
        data[i * 2 + j + 4] = m.data[4];
        data[i * 2 + j + 5] = m.data[5];
        data[i * 2 + j + 6] = m.data[6];
        data[i * 2 + j + 7] = m.data[7];

        return *this;
    }

    /// Forms a 4-by-2 matrix by horizontally concatenating a 4-by-1 matrix with
    /// a 4-by-1 matrix
    CUTLASS_HOST_DEVICE
    static Matrix hcat(Matrix<Element, 4, 1> const& lhs,
                       Matrix<Element, 4, 1> const& rhs) {
        return Matrix(lhs.at(0, 0), rhs.at(0, 0), lhs.at(1, 0), rhs.at(1, 0),
                      lhs.at(2, 0), rhs.at(2, 0), lhs.at(3, 0), rhs.at(3, 0));
    }

    /// Concatenates this matrix with a a 4-by-1 matrix to form a 4-by-3 matrix
    CUTLASS_HOST_DEVICE
    Matrix<Element, 4, 3> hcat(Matrix<Element, 4, 1> const& rhs) const {
        return Matrix<Element, 4, 3>::hcat(*this, rhs);
    }

    /// Concatenates this matrix with a a 4-by-2 matrix to form a 4-by-4 matrix
    CUTLASS_HOST_DEVICE
    Matrix<Element, 4, 4> hcat(Matrix<Element, 4, 2> const& rhs) const {
        return Matrix<Element, 4, 4>::hcat(*this, rhs);
    }

    /// Forms a 4-by-2 matrix by vertically concatenating a 1-by-2 matrix with a
    /// 3-by-2 matrix
    CUTLASS_HOST_DEVICE
    static Matrix vcat(Matrix<Element, 1, 2> const& upper,
                       Matrix<Element, 3, 2> const& lower) {
        return Matrix(upper.at(0, 0), upper.at(0, 1), lower.at(0, 0),
                      lower.at(0, 1), lower.at(1, 0), lower.at(1, 1),
                      lower.at(2, 0), lower.at(2, 1));
    }

    /// Forms a 4-by-2 matrix by vertically concatenating a 2-by-2 matrix with a
    /// 2-by-2 matrix
    CUTLASS_HOST_DEVICE
    static Matrix vcat(Matrix<Element, 2, 2> const& upper,
                       Matrix<Element, 2, 2> const& lower) {
        return Matrix(upper.at(0, 0), upper.at(0, 1), upper.at(1, 0),
                      upper.at(1, 1), lower.at(0, 0), lower.at(0, 1),
                      lower.at(1, 0), lower.at(1, 1));
    }

    /// Forms a 4-by-2 matrix by vertically concatenating a 3-by-2 matrix with a
    /// 1-by-2 matrix
    CUTLASS_HOST_DEVICE
    static Matrix vcat(Matrix<Element, 3, 2> const& upper,
                       Matrix<Element, 1, 2> const& lower) {
        return Matrix(upper.at(0, 0), upper.at(0, 1), upper.at(1, 0),
                      upper.at(1, 1), upper.at(2, 0), upper.at(2, 1),
                      lower.at(0, 0), lower.at(0, 1));
    }

    /// Forms a 4-by-2 matrix by concatenating four components
    CUTLASS_HOST_DEVICE
    static Matrix block(Element A, Element B, Matrix<Element, 3, 1> const& C,
                        Matrix<Element, 3, 1> const& D) {
        return Matrix(A, B, C.at(0, 0), D.at(0, 0), C.at(1, 0), D.at(1, 0),
                      C.at(2, 0), D.at(2, 0));
    }

    /// Forms a 4-by-2 matrix by concatenating four components
    CUTLASS_HOST_DEVICE
    static Matrix block(Matrix<Element, 2, 1> const& A,
                        Matrix<Element, 2, 1> const& B,
                        Matrix<Element, 2, 1> const& C,
                        Matrix<Element, 2, 1> const& D) {
        return Matrix(A.at(0, 0), B.at(0, 0), A.at(1, 0), B.at(1, 0),
                      C.at(0, 0), D.at(0, 0), C.at(1, 0), D.at(1, 0));
    }

    /// Forms a 4-by-2 matrix by concatenating four components
    CUTLASS_HOST_DEVICE
    static Matrix block(Matrix<Element, 3, 1> const& A,
                        Matrix<Element, 3, 1> const& B, Element C, Element D) {
        return Matrix(A.at(0, 0), B.at(0, 0), A.at(1, 0), B.at(1, 0),
                      A.at(2, 0), B.at(2, 0), C, D);
    }

    /// Elementwise add operator (4-by-2)
    CUTLASS_HOST_DEVICE
    Matrix add(Matrix const& rhs) const {
        Matrix result;

        result.data[0] = data[0] + rhs.data[0];
        result.data[1] = data[1] + rhs.data[1];

        result.data[2] = data[2] + rhs.data[2];
        result.data[3] = data[3] + rhs.data[3];

        result.data[4] = data[4] + rhs.data[4];
        result.data[5] = data[5] + rhs.data[5];

        result.data[6] = data[6] + rhs.data[6];
        result.data[7] = data[7] + rhs.data[7];

        return result;
    }

    /// Elementwise add operator (4-by-2)
    CUTLASS_HOST_DEVICE
    Matrix operator+(Matrix const& rhs) const { return add(rhs); }

    /// Elementwise add operator (4-by-2)
    CUTLASS_HOST_DEVICE
    Matrix& operator+=(Matrix const& rhs) {
        data[0] += rhs.data[0];
        data[1] += rhs.data[1];

        data[2] += rhs.data[2];
        data[3] += rhs.data[3];

        data[4] += rhs.data[4];
        data[5] += rhs.data[5];

        data[6] += rhs.data[6];
        data[7] += rhs.data[7];

        return *this;
    }

    /// Elementwise subtract operator (4-by-2)
    CUTLASS_HOST_DEVICE
    Matrix subtract(Matrix const& rhs) const {
        Matrix result;

        result.data[0] = data[0] - rhs.data[0];
        result.data[1] = data[1] - rhs.data[1];

        result.data[2] = data[2] - rhs.data[2];
        result.data[3] = data[3] - rhs.data[3];

        result.data[4] = data[4] - rhs.data[4];
        result.data[5] = data[5] - rhs.data[5];

        result.data[6] = data[6] - rhs.data[6];
        result.data[7] = data[7] - rhs.data[7];

        return result;
    }

    /// Elementwise subtract operator (4-by-2)
    CUTLASS_HOST_DEVICE
    Matrix operator-(Matrix const& rhs) const { return subtract(rhs); }

    /// Elementwise subtract operator (4-by-2)
    CUTLASS_HOST_DEVICE
    Matrix& operator-=(Matrix const& rhs) {
        data[0] -= rhs.data[0];
        data[1] -= rhs.data[1];

        data[2] -= rhs.data[2];
        data[3] -= rhs.data[3];

        data[4] -= rhs.data[4];
        data[5] -= rhs.data[5];

        data[6] -= rhs.data[6];
        data[7] -= rhs.data[7];

        return *this;
    }

    /// Elementwise multiply operator (4-by-2)
    CUTLASS_HOST_DEVICE
    Matrix multiply(Matrix const& rhs) const {
        Matrix result;

        result.data[0] = data[0] * rhs.data[0];
        result.data[1] = data[1] * rhs.data[1];

        result.data[2] = data[2] * rhs.data[2];
        result.data[3] = data[3] * rhs.data[3];

        result.data[4] = data[4] * rhs.data[4];
        result.data[5] = data[5] * rhs.data[5];

        result.data[6] = data[6] * rhs.data[6];
        result.data[7] = data[7] * rhs.data[7];

        return result;
    }

    /// Scalar multiply operator (4-by-2)
    CUTLASS_HOST_DEVICE
    Matrix multiply(Element const& s) const {
        Matrix result;

        result.data[0] = data[0] * s;
        result.data[1] = data[1] * s;

        result.data[2] = data[2] * s;
        result.data[3] = data[3] * s;

        result.data[4] = data[4] * s;
        result.data[5] = data[5] * s;

        result.data[6] = data[6] * s;
        result.data[7] = data[7] * s;

        return result;
    }

    /// Scalar multiply operator (4-by-2)
    CUTLASS_HOST_DEVICE
    Matrix operator*(Element const& s) const { return multiply(s); }

    /// Scalar multiply operator (4-by-2)
    CUTLASS_HOST_DEVICE
    Matrix& operator*=(Element const& s) {
        data[0] *= s;
        data[1] *= s;

        data[2] *= s;
        data[3] *= s;

        data[4] *= s;
        data[5] *= s;

        data[6] *= s;
        data[7] *= s;

        return *this;
    }

    /// Elementwise divide operator (4-by-2)
    CUTLASS_HOST_DEVICE
    Matrix divide(Matrix const& rhs) const {
        Matrix result;

        result.data[0] = data[0] / rhs.data[0];
        result.data[1] = data[1] / rhs.data[1];

        result.data[2] = data[2] / rhs.data[2];
        result.data[3] = data[3] / rhs.data[3];

        result.data[4] = data[4] / rhs.data[4];
        result.data[5] = data[5] / rhs.data[5];

        result.data[6] = data[6] / rhs.data[6];
        result.data[7] = data[7] / rhs.data[7];

        return result;
    }

    /// Scalar divide operator (4-by-2)
    CUTLASS_HOST_DEVICE
    Matrix divide(Element const& s) const {
        Matrix result;

        result.data[0] = data[0] / s;
        result.data[1] = data[1] / s;

        result.data[2] = data[2] / s;
        result.data[3] = data[3] / s;

        result.data[4] = data[4] / s;
        result.data[5] = data[5] / s;

        result.data[6] = data[6] / s;
        result.data[7] = data[7] / s;

        return result;
    }

    /// Scalar divide operator (4-by-2)
    CUTLASS_HOST_DEVICE
    Matrix operator/(Element const& s) const { return divide(s); }

    /// Scalar divide operator (4-by-2)
    CUTLASS_HOST_DEVICE
    Matrix& operator/=(Element const& s) {
        data[0] /= s;
        data[1] /= s;

        data[2] /= s;
        data[3] /= s;

        data[4] /= s;
        data[5] /= s;

        data[6] /= s;
        data[7] /= s;

        return *this;
    }

    /// Elementwise divide operator (4-by-2)
    CUTLASS_HOST_DEVICE
    Matrix operator/(Matrix const& rhs) const { return divide(rhs); }

    /// Elementwise divide operator (4-by-2)
    CUTLASS_HOST_DEVICE
    Matrix& operator/=(Matrix const& rhs) {
        data[0] /= rhs.data[0];
        data[1] /= rhs.data[1];

        data[2] /= rhs.data[2];
        data[3] /= rhs.data[3];

        data[4] /= rhs.data[4];
        data[5] /= rhs.data[5];

        data[6] /= rhs.data[6];
        data[7] /= rhs.data[7];

        return *this;
    }

    /// Negates each element of the matrix
    CUTLASS_HOST_DEVICE
    Matrix operator-() const {
        Matrix m;

        m.data[0] = -m.data[0];
        m.data[1] = -m.data[1];
        m.data[2] = -m.data[2];
        m.data[3] = -m.data[3];
        m.data[4] = -m.data[4];
        m.data[5] = -m.data[5];
        m.data[6] = -m.data[6];
        m.data[7] = -m.data[7];

        return m;
    }

    /// Matrix product of size 4-by-1-by-2
    CUTLASS_HOST_DEVICE
    Matrix<Element, 4, 1> product(
            Matrix<Element, 2, 1> const& rhs,
            Matrix<Element, 4, 1> accum = Matrix<Element, 4, 1>()) const {
        // k=0
        accum.data[0] += data[0] * rhs.data[0];
        accum.data[1] += data[2] * rhs.data[0];
        accum.data[2] += data[4] * rhs.data[0];
        accum.data[3] += data[6] * rhs.data[0];

        // k=1
        accum.data[0] += data[1] * rhs.data[1];
        accum.data[1] += data[3] * rhs.data[1];
        accum.data[2] += data[5] * rhs.data[1];
        accum.data[3] += data[7] * rhs.data[1];

        return accum;
    }

    /// Matrix product of size 4-by-1-by-2
    CUTLASS_HOST_DEVICE
    Matrix<Element, 4, 1> operator*(Matrix<Element, 2, 1> const& rhs) const {
        return product(rhs);
    }

    /// Matrix product of size 4-by-2-by-2
    CUTLASS_HOST_DEVICE
    Matrix<Element, 4, 2> product(
            Matrix<Element, 2, 2> const& rhs,
            Matrix<Element, 4, 2> accum = Matrix<Element, 4, 2>()) const {
        // k=0
        accum.data[0] += data[0] * rhs.data[0];
        accum.data[1] += data[0] * rhs.data[1];
        accum.data[2] += data[2] * rhs.data[0];
        accum.data[3] += data[2] * rhs.data[1];
        accum.data[4] += data[4] * rhs.data[0];
        accum.data[5] += data[4] * rhs.data[1];
        accum.data[6] += data[6] * rhs.data[0];
        accum.data[7] += data[6] * rhs.data[1];

        // k=1
        accum.data[0] += data[1] * rhs.data[2];
        accum.data[1] += data[1] * rhs.data[3];
        accum.data[2] += data[3] * rhs.data[2];
        accum.data[3] += data[3] * rhs.data[3];
        accum.data[4] += data[5] * rhs.data[2];
        accum.data[5] += data[5] * rhs.data[3];
        accum.data[6] += data[7] * rhs.data[2];
        accum.data[7] += data[7] * rhs.data[3];

        return accum;
    }

    /// Matrix product of size 4-by-2-by-2
    CUTLASS_HOST_DEVICE
    Matrix<Element, 4, 2> operator*(Matrix<Element, 2, 2> const& rhs) const {
        return product(rhs);
    }

    /// Matrix product of size 4-by-2-by-2
    CUTLASS_HOST_DEVICE
    Matrix& operator*=(Matrix<Element, 2, 2> const& rhs) {
        *this = product(rhs);
        return *this;
    }

    /// Matrix product of size 4-by-3-by-2
    CUTLASS_HOST_DEVICE
    Matrix<Element, 4, 3> product(
            Matrix<Element, 2, 3> const& rhs,
            Matrix<Element, 4, 3> accum = Matrix<Element, 4, 3>()) const {
        // k=0
        accum.data[0] += data[0] * rhs.data[0];
        accum.data[1] += data[0] * rhs.data[1];
        accum.data[2] += data[0] * rhs.data[2];
        accum.data[3] += data[2] * rhs.data[0];
        accum.data[4] += data[2] * rhs.data[1];
        accum.data[5] += data[2] * rhs.data[2];
        accum.data[6] += data[4] * rhs.data[0];
        accum.data[7] += data[4] * rhs.data[1];
        accum.data[8] += data[4] * rhs.data[2];
        accum.data[9] += data[6] * rhs.data[0];
        accum.data[10] += data[6] * rhs.data[1];
        accum.data[11] += data[6] * rhs.data[2];

        // k=1
        accum.data[0] += data[1] * rhs.data[3];
        accum.data[1] += data[1] * rhs.data[4];
        accum.data[2] += data[1] * rhs.data[5];
        accum.data[3] += data[3] * rhs.data[3];
        accum.data[4] += data[3] * rhs.data[4];
        accum.data[5] += data[3] * rhs.data[5];
        accum.data[6] += data[5] * rhs.data[3];
        accum.data[7] += data[5] * rhs.data[4];
        accum.data[8] += data[5] * rhs.data[5];
        accum.data[9] += data[7] * rhs.data[3];
        accum.data[10] += data[7] * rhs.data[4];
        accum.data[11] += data[7] * rhs.data[5];

        return accum;
    }

    /// Matrix product of size 4-by-3-by-2
    CUTLASS_HOST_DEVICE
    Matrix<Element, 4, 3> operator*(Matrix<Element, 2, 3> const& rhs) const {
        return product(rhs);
    }

    /// Matrix product of size 4-by-4-by-2
    CUTLASS_HOST_DEVICE
    Matrix<Element, 4, 4> product(
            Matrix<Element, 2, 4> const& rhs,
            Matrix<Element, 4, 4> accum = Matrix<Element, 4, 4>()) const {
        // k=0
        accum.data[0] += data[0] * rhs.data[0];
        accum.data[1] += data[0] * rhs.data[1];
        accum.data[2] += data[0] * rhs.data[2];
        accum.data[3] += data[0] * rhs.data[3];
        accum.data[4] += data[2] * rhs.data[0];
        accum.data[5] += data[2] * rhs.data[1];
        accum.data[6] += data[2] * rhs.data[2];
        accum.data[7] += data[2] * rhs.data[3];
        accum.data[8] += data[4] * rhs.data[0];
        accum.data[9] += data[4] * rhs.data[1];
        accum.data[10] += data[4] * rhs.data[2];
        accum.data[11] += data[4] * rhs.data[3];
        accum.data[12] += data[6] * rhs.data[0];
        accum.data[13] += data[6] * rhs.data[1];
        accum.data[14] += data[6] * rhs.data[2];
        accum.data[15] += data[6] * rhs.data[3];

        // k=1
        accum.data[0] += data[1] * rhs.data[4];
        accum.data[1] += data[1] * rhs.data[5];
        accum.data[2] += data[1] * rhs.data[6];
        accum.data[3] += data[1] * rhs.data[7];
        accum.data[4] += data[3] * rhs.data[4];
        accum.data[5] += data[3] * rhs.data[5];
        accum.data[6] += data[3] * rhs.data[6];
        accum.data[7] += data[3] * rhs.data[7];
        accum.data[8] += data[5] * rhs.data[4];
        accum.data[9] += data[5] * rhs.data[5];
        accum.data[10] += data[5] * rhs.data[6];
        accum.data[11] += data[5] * rhs.data[7];
        accum.data[12] += data[7] * rhs.data[4];
        accum.data[13] += data[7] * rhs.data[5];
        accum.data[14] += data[7] * rhs.data[6];
        accum.data[15] += data[7] * rhs.data[7];

        return accum;
    }

    /// Matrix product of size 4-by-4-by-2
    CUTLASS_HOST_DEVICE
    Matrix<Element, 4, 4> operator*(Matrix<Element, 2, 4> const& rhs) const {
        return product(rhs);
    }

    /// Returns the sum of elements
    CUTLASS_HOST_DEVICE
    Element sum(Element accum = Element()) const {
        accum += data[0];
        accum += data[1];
        accum += data[2];
        accum += data[3];
        accum += data[4];
        accum += data[5];
        accum += data[6];
        accum += data[7];

        return accum;
    }

    /// Returns the sum of squared elements
    CUTLASS_HOST_DEVICE
    Element norm(Element accum = Element()) const {
        accum += data[0] * data[0];
        accum += data[1] * data[1];
        accum += data[2] * data[2];
        accum += data[3] * data[3];
        accum += data[4] * data[4];
        accum += data[5] * data[5];
        accum += data[6] * data[6];
        accum += data[7] * data[7];

        return accum;
    }

    /// Returns square root of the norm
    CUTLASS_HOST_DEVICE
    Element magnitude() const { return fast_sqrt(norm()); }

    /// Returns the sum of diagonal elements
    CUTLASS_HOST_DEVICE
    Element trace(Element accum = Element()) const {
        accum += data[0];
        accum += data[3];

        return accum;
    }
};

/// Template alias for 4-by-2 matrix
template <typename Element>
using Matrix4x2 = Matrix<Element, 4, 2>;

/// Free funciton to infer element type from template arguments
template <typename Element>
CUTLASS_HOST_DEVICE Matrix4x2<Element> make_Matrix4x2(
        Element _0_0, Element _0_1, Element _1_0, Element _1_1, Element _2_0,
        Element _2_1, Element _3_0, Element _3_1) {
    return Matrix4x2<Element>(_0_0, _0_1, _1_0, _1_1, _2_0, _2_1, _3_0, _3_1);
}

/////////////////////////////////////////////////////////////////////////////////////////////////

/// 4-by-3 matrix template class definition
template <typename Element_>
struct Matrix<Element_, 4, 3> {
    //
    // Type definitions
    //

    /// Element data type
    using Element = Element_;

    /// Number of rows in matrix
    static int const kRows = 4;

    /// Number of columns in matrix
    static int const kColumns = 3;

    /// Layout of matrix in underlying array
    using Layout = layout::RowMajor;

    /// Number of elements in matrix
    static int const kCount = 12;

    //
    // Data members
    //

    /// Elements of the matrix in row-major layout
    Array<Element, kCount> data;

    //
    // Methods
    //

    /// Constructs a zero matrix
    CUTLASS_HOST_DEVICE
    Matrix() { data.clear(); }

    /// Copy constructor for a 4-by-3 matrix
    CUTLASS_HOST_DEVICE
    Matrix(Matrix const& rhs) { data = rhs.data; }

    /// Constucts a 4-by-3 matrix from scalar elements
    CUTLASS_HOST_DEVICE
    Matrix(Element _0_0, Element _0_1, Element _0_2, Element _1_0, Element _1_1,
           Element _1_2, Element _2_0, Element _2_1, Element _2_2, Element _3_0,
           Element _3_1, Element _3_2) {
        data[0] = _0_0;
        data[1] = _0_1;
        data[2] = _0_2;
        data[3] = _1_0;
        data[4] = _1_1;
        data[5] = _1_2;
        data[6] = _2_0;
        data[7] = _2_1;
        data[8] = _2_2;
        data[9] = _3_0;
        data[10] = _3_1;
        data[11] = _3_2;
    }

    /// Constucts a 4-by-3 matrix from row vectors
    CUTLASS_HOST_DEVICE
    Matrix(Matrix<Element, 1, 3> const& row_0,
           Matrix<Element, 1, 3> const& row_1,
           Matrix<Element, 1, 3> const& row_2,
           Matrix<Element, 1, 3> const& row_3) {
        data[0] = row_0.data[0];
        data[1] = row_0.data[1];
        data[2] = row_0.data[2];
        data[3] = row_1.data[0];
        data[4] = row_1.data[1];
        data[5] = row_1.data[2];
        data[6] = row_2.data[0];
        data[7] = row_2.data[1];
        data[8] = row_2.data[2];
        data[9] = row_3.data[0];
        data[10] = row_3.data[1];
        data[11] = row_3.data[2];
    }

    /// Static method to construct a 4-by-3 matrix from column vectors
    CUTLASS_HOST_DEVICE
    static Matrix from_columns(Matrix<Element, 3, 1> const& column_0,
                               Matrix<Element, 3, 1> const& column_1,
                               Matrix<Element, 3, 1> const& column_2) {
        Matrix result;

        result.data[0] = column_0.data[0];
        result.data[1] = column_1.data[0];
        result.data[2] = column_2.data[0];
        result.data[3] = column_0.data[1];
        result.data[4] = column_1.data[1];
        result.data[5] = column_2.data[1];
        result.data[6] = column_0.data[2];
        result.data[7] = column_1.data[2];
        result.data[8] = column_2.data[2];
        result.data[9] = column_0.data[3];
        result.data[10] = column_1.data[3];
        result.data[11] = column_2.data[3];
        return result;
    }

    /// Constructs a matrix from a uniform element
    CUTLASS_HOST_DEVICE
    static Matrix uniform(Element s) {
        Matrix m;

        m.data[0] = s;
        m.data[1] = s;
        m.data[2] = s;
        m.data[3] = s;
        m.data[4] = s;
        m.data[5] = s;
        m.data[6] = s;
        m.data[7] = s;
        m.data[8] = s;
        m.data[9] = s;
        m.data[10] = s;
        m.data[11] = s;

        return m;
    }

    /// Constructs a matrix from a uniform element 1
    CUTLASS_HOST_DEVICE
    static Matrix ones() { return uniform(Element(1)); }

    /// Constructs a matrix from a uniform element 0
    CUTLASS_HOST_DEVICE
    static Matrix zero() { return Matrix(); }

    /// Constructs a matrix from elements along its diagonal
    CUTLASS_HOST_DEVICE
    static Matrix from_diagonal(Matrix<Element, 3, 1> const& diag) {
        Matrix m;

        m.data[0] = diag.data[0];
        m.data[5] = diag.data[1];
        m.data[10] = diag.data[2];
        m.data[15] = diag.data[3];

        return m;
    }

    /// Constructs a matrix from elements along its diagonal
    CUTLASS_HOST_DEVICE
    static Matrix from_diagonal(Matrix<Element, 1, 3> const& diag) {
        Matrix m;

        m.data[0] = diag.data[0];
        m.data[5] = diag.data[1];
        m.data[10] = diag.data[2];
        m.data[15] = diag.data[3];

        return m;
    }

    /// Gets an array of diagonal elements
    CUTLASS_HOST_DEVICE
    Matrix<Element, 3, 1> diagonal() const {
        Matrix<Element, 3, 1> diag;

        diag.data[0] = data[0];
        diag.data[1] = data[5];
        diag.data[2] = data[10];
        diag.data[3] = data[15];

        return diag;
    }

    /// Returns a transposed matrix
    CUTLASS_HOST_DEVICE
    Matrix<Element, 3, 4> transpose() const {
        Matrix<Element, 3, 4> mt;

        mt.data[0] = data[0];
        mt.data[4] = data[1];
        mt.data[8] = data[2];
        mt.data[1] = data[3];
        mt.data[5] = data[4];
        mt.data[9] = data[5];
        mt.data[2] = data[6];
        mt.data[6] = data[7];
        mt.data[10] = data[8];
        mt.data[3] = data[9];
        mt.data[7] = data[10];
        mt.data[11] = data[11];

        return mt;
    }

    /// Accesses an element by coordinate
    CUTLASS_HOST_DEVICE
    Element at(int i, int j) const { return data[i * 4 + j]; }

    /// Accesses an element by coordinate
    CUTLASS_HOST_DEVICE
    Element& at(int i, int j) { return data[i * 4 + j]; }

    /// Accesses an element by coordinate
    CUTLASS_HOST_DEVICE
    Element at(Coord<2> const& coord) const { return at(coord[0], coord[1]); }

    /// Accesses an element by coordinate
    CUTLASS_HOST_DEVICE
    Element& at(Coord<2> const& coord) { return at(coord[0], coord[1]); }

    /// Accesses an element by offset
    CUTLASS_HOST_DEVICE
    Element& at(int offset) { return data[offset]; }

    /// Accesses an element by offset
    CUTLASS_HOST_DEVICE
    Element at(int offset) const { return data[offset]; }

    /// Accesses an element by coordinate
    CUTLASS_HOST_DEVICE
    Element operator[](Coord<2> const& coord) const {
        return at(coord[0], coord[1]);
    }

    /// Accesses an element by coordinate
    CUTLASS_HOST_DEVICE
    Element& operator[](Coord<2> const& coord) {
        return at(coord[0], coord[1]);
    }

    /// Accesses an element by offset
    CUTLASS_HOST_DEVICE
    Element& operator[](int offset) { return data[offset]; }

    /// Accesses an element by offset
    CUTLASS_HOST_DEVICE
    Element operator[](int offset) const { return data[offset]; }

    /// Gets a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix<Element, 1, 2> slice_1x2(int i = 0, int j = 0) const {
        Matrix<Element, 1, 2> m;

        m.data[0] = data[i * 3 + j + 0];
        m.data[1] = data[i * 3 + j + 1];

        return m;
    }

    /// Overwrites a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix& set_slice_1x2(Matrix<Element, 1, 2> const& m, int i = 0,
                          int j = 0) {
        data[i * 3 + j + 0] = m.data[0];
        data[i * 3 + j + 1] = m.data[1];

        return *this;
    }

    /// Gets a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix<Element, 1, 3> slice_1x3(int i = 0, int j = 0) const {
        Matrix<Element, 1, 3> m;

        m.data[0] = data[i * 3 + j + 0];
        m.data[1] = data[i * 3 + j + 1];
        m.data[2] = data[i * 3 + j + 2];

        return m;
    }

    /// Overwrites a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix& set_slice_1x3(Matrix<Element, 1, 3> const& m, int i = 0,
                          int j = 0) {
        data[i * 3 + j + 0] = m.data[0];
        data[i * 3 + j + 1] = m.data[1];
        data[i * 3 + j + 2] = m.data[2];

        return *this;
    }

    CUTLASS_HOST_DEVICE
    Matrix<Element, 1, 3> row(int i) const { return slice_1x3(i, 0); }

    Matrix& set_row(Matrix<Element, 1, 3> const& v, int i = 0) {
        return set_slice_1x3(v, i, 0);
    }

    /// Gets a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix<Element, 2, 1> slice_2x1(int i = 0, int j = 0) const {
        Matrix<Element, 2, 1> m;

        m.data[0] = data[i * 3 + j + 0];
        m.data[1] = data[i * 3 + j + 3];

        return m;
    }

    /// Overwrites a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix& set_slice_2x1(Matrix<Element, 2, 1> const& m, int i = 0,
                          int j = 0) {
        data[i * 3 + j + 0] = m.data[0];
        data[i * 3 + j + 3] = m.data[1];

        return *this;
    }

    /// Gets a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix<Element, 2, 2> slice_2x2(int i = 0, int j = 0) const {
        Matrix<Element, 2, 2> m;

        m.data[0] = data[i * 3 + j + 0];
        m.data[1] = data[i * 3 + j + 1];
        m.data[2] = data[i * 3 + j + 3];
        m.data[3] = data[i * 3 + j + 4];

        return m;
    }

    /// Overwrites a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix& set_slice_2x2(Matrix<Element, 2, 2> const& m, int i = 0,
                          int j = 0) {
        data[i * 3 + j + 0] = m.data[0];
        data[i * 3 + j + 1] = m.data[1];
        data[i * 3 + j + 3] = m.data[2];
        data[i * 3 + j + 4] = m.data[3];

        return *this;
    }

    /// Gets a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix<Element, 2, 3> slice_2x3(int i = 0, int j = 0) const {
        Matrix<Element, 2, 3> m;

        m.data[0] = data[i * 3 + j + 0];
        m.data[1] = data[i * 3 + j + 1];
        m.data[2] = data[i * 3 + j + 2];
        m.data[3] = data[i * 3 + j + 3];
        m.data[4] = data[i * 3 + j + 4];
        m.data[5] = data[i * 3 + j + 5];

        return m;
    }

    /// Overwrites a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix& set_slice_2x3(Matrix<Element, 2, 3> const& m, int i = 0,
                          int j = 0) {
        data[i * 3 + j + 0] = m.data[0];
        data[i * 3 + j + 1] = m.data[1];
        data[i * 3 + j + 2] = m.data[2];
        data[i * 3 + j + 3] = m.data[3];
        data[i * 3 + j + 4] = m.data[4];
        data[i * 3 + j + 5] = m.data[5];

        return *this;
    }

    /// Gets a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix<Element, 3, 1> slice_3x1(int i = 0, int j = 0) const {
        Matrix<Element, 3, 1> m;

        m.data[0] = data[i * 3 + j + 0];
        m.data[1] = data[i * 3 + j + 3];
        m.data[2] = data[i * 3 + j + 6];

        return m;
    }

    /// Overwrites a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix& set_slice_3x1(Matrix<Element, 3, 1> const& m, int i = 0,
                          int j = 0) {
        data[i * 3 + j + 0] = m.data[0];
        data[i * 3 + j + 3] = m.data[1];
        data[i * 3 + j + 6] = m.data[2];

        return *this;
    }

    /// Gets a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix<Element, 3, 2> slice_3x2(int i = 0, int j = 0) const {
        Matrix<Element, 3, 2> m;

        m.data[0] = data[i * 3 + j + 0];
        m.data[1] = data[i * 3 + j + 1];
        m.data[2] = data[i * 3 + j + 3];
        m.data[3] = data[i * 3 + j + 4];
        m.data[4] = data[i * 3 + j + 6];
        m.data[5] = data[i * 3 + j + 7];

        return m;
    }

    /// Overwrites a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix& set_slice_3x2(Matrix<Element, 3, 2> const& m, int i = 0,
                          int j = 0) {
        data[i * 3 + j + 0] = m.data[0];
        data[i * 3 + j + 1] = m.data[1];
        data[i * 3 + j + 3] = m.data[2];
        data[i * 3 + j + 4] = m.data[3];
        data[i * 3 + j + 6] = m.data[4];
        data[i * 3 + j + 7] = m.data[5];

        return *this;
    }

    /// Gets a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix<Element, 3, 3> slice_3x3(int i = 0, int j = 0) const {
        Matrix<Element, 3, 3> m;

        m.data[0] = data[i * 3 + j + 0];
        m.data[1] = data[i * 3 + j + 1];
        m.data[2] = data[i * 3 + j + 2];
        m.data[3] = data[i * 3 + j + 3];
        m.data[4] = data[i * 3 + j + 4];
        m.data[5] = data[i * 3 + j + 5];
        m.data[6] = data[i * 3 + j + 6];
        m.data[7] = data[i * 3 + j + 7];
        m.data[8] = data[i * 3 + j + 8];

        return m;
    }

    /// Overwrites a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix& set_slice_3x3(Matrix<Element, 3, 3> const& m, int i = 0,
                          int j = 0) {
        data[i * 3 + j + 0] = m.data[0];
        data[i * 3 + j + 1] = m.data[1];
        data[i * 3 + j + 2] = m.data[2];
        data[i * 3 + j + 3] = m.data[3];
        data[i * 3 + j + 4] = m.data[4];
        data[i * 3 + j + 5] = m.data[5];
        data[i * 3 + j + 6] = m.data[6];
        data[i * 3 + j + 7] = m.data[7];
        data[i * 3 + j + 8] = m.data[8];

        return *this;
    }

    /// Gets a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix<Element, 4, 1> slice_4x1(int i = 0, int j = 0) const {
        Matrix<Element, 4, 1> m;

        m.data[0] = data[i * 3 + j + 0];
        m.data[1] = data[i * 3 + j + 3];
        m.data[2] = data[i * 3 + j + 6];
        m.data[3] = data[i * 3 + j + 9];

        return m;
    }

    /// Overwrites a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix& set_slice_4x1(Matrix<Element, 4, 1> const& m, int i = 0,
                          int j = 0) {
        data[i * 3 + j + 0] = m.data[0];
        data[i * 3 + j + 3] = m.data[1];
        data[i * 3 + j + 6] = m.data[2];
        data[i * 3 + j + 9] = m.data[3];

        return *this;
    }

    CUTLASS_HOST_DEVICE
    Matrix<Element, 4, 1> column(int j) const { return slice_4x1(0, j); }

    Matrix& set_column(Matrix<Element, 4, 1> const& v, int j = 0) {
        return set_slice_4x1(v, 0, j);
    }

    /// Gets a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix<Element, 4, 2> slice_4x2(int i = 0, int j = 0) const {
        Matrix<Element, 4, 2> m;

        m.data[0] = data[i * 3 + j + 0];
        m.data[1] = data[i * 3 + j + 1];
        m.data[2] = data[i * 3 + j + 3];
        m.data[3] = data[i * 3 + j + 4];
        m.data[4] = data[i * 3 + j + 6];
        m.data[5] = data[i * 3 + j + 7];
        m.data[6] = data[i * 3 + j + 9];
        m.data[7] = data[i * 3 + j + 10];

        return m;
    }

    /// Overwrites a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix& set_slice_4x2(Matrix<Element, 4, 2> const& m, int i = 0,
                          int j = 0) {
        data[i * 3 + j + 0] = m.data[0];
        data[i * 3 + j + 1] = m.data[1];
        data[i * 3 + j + 3] = m.data[2];
        data[i * 3 + j + 4] = m.data[3];
        data[i * 3 + j + 6] = m.data[4];
        data[i * 3 + j + 7] = m.data[5];
        data[i * 3 + j + 9] = m.data[6];
        data[i * 3 + j + 10] = m.data[7];

        return *this;
    }

    /// Gets a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix<Element, 4, 3> slice_4x3(int i = 0, int j = 0) const {
        Matrix<Element, 4, 3> m;

        m.data[0] = data[i * 3 + j + 0];
        m.data[1] = data[i * 3 + j + 1];
        m.data[2] = data[i * 3 + j + 2];
        m.data[3] = data[i * 3 + j + 3];
        m.data[4] = data[i * 3 + j + 4];
        m.data[5] = data[i * 3 + j + 5];
        m.data[6] = data[i * 3 + j + 6];
        m.data[7] = data[i * 3 + j + 7];
        m.data[8] = data[i * 3 + j + 8];
        m.data[9] = data[i * 3 + j + 9];
        m.data[10] = data[i * 3 + j + 10];
        m.data[11] = data[i * 3 + j + 11];

        return m;
    }

    /// Overwrites a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix& set_slice_4x3(Matrix<Element, 4, 3> const& m, int i = 0,
                          int j = 0) {
        data[i * 3 + j + 0] = m.data[0];
        data[i * 3 + j + 1] = m.data[1];
        data[i * 3 + j + 2] = m.data[2];
        data[i * 3 + j + 3] = m.data[3];
        data[i * 3 + j + 4] = m.data[4];
        data[i * 3 + j + 5] = m.data[5];
        data[i * 3 + j + 6] = m.data[6];
        data[i * 3 + j + 7] = m.data[7];
        data[i * 3 + j + 8] = m.data[8];
        data[i * 3 + j + 9] = m.data[9];
        data[i * 3 + j + 10] = m.data[10];
        data[i * 3 + j + 11] = m.data[11];

        return *this;
    }

    /// Forms a 4-by-3 matrix by horizontally concatenating a 4-by-1 matrix with
    /// a 4-by-2 matrix
    CUTLASS_HOST_DEVICE
    static Matrix hcat(Matrix<Element, 4, 1> const& lhs,
                       Matrix<Element, 4, 2> const& rhs) {
        return Matrix(lhs.at(0, 0), rhs.at(0, 0), rhs.at(0, 1), lhs.at(1, 0),
                      rhs.at(1, 0), rhs.at(1, 1), lhs.at(2, 0), rhs.at(2, 0),
                      rhs.at(2, 1), lhs.at(3, 0), rhs.at(3, 0), rhs.at(3, 1));
    }

    /// Forms a 4-by-3 matrix by horizontally concatenating a 4-by-2 matrix with
    /// a 4-by-1 matrix
    CUTLASS_HOST_DEVICE
    static Matrix hcat(Matrix<Element, 4, 2> const& lhs,
                       Matrix<Element, 4, 1> const& rhs) {
        return Matrix(lhs.at(0, 0), lhs.at(0, 1), rhs.at(0, 0), lhs.at(1, 0),
                      lhs.at(1, 1), rhs.at(1, 0), lhs.at(2, 0), lhs.at(2, 1),
                      rhs.at(2, 0), lhs.at(3, 0), lhs.at(3, 1), rhs.at(3, 0));
    }

    /// Concatenates this matrix with a a 4-by-1 matrix to form a 4-by-4 matrix
    CUTLASS_HOST_DEVICE
    Matrix<Element, 4, 4> hcat(Matrix<Element, 4, 1> const& rhs) const {
        return Matrix<Element, 4, 4>::hcat(*this, rhs);
    }

    /// Forms a 4-by-3 matrix by vertically concatenating a 1-by-3 matrix with a
    /// 3-by-3 matrix
    CUTLASS_HOST_DEVICE
    static Matrix vcat(Matrix<Element, 1, 3> const& upper,
                       Matrix<Element, 3, 3> const& lower) {
        return Matrix(upper.at(0, 0), upper.at(0, 1), upper.at(0, 2),
                      lower.at(0, 0), lower.at(0, 1), lower.at(0, 2),
                      lower.at(1, 0), lower.at(1, 1), lower.at(1, 2),
                      lower.at(2, 0), lower.at(2, 1), lower.at(2, 2));
    }

    /// Forms a 4-by-3 matrix by vertically concatenating a 2-by-3 matrix with a
    /// 2-by-3 matrix
    CUTLASS_HOST_DEVICE
    static Matrix vcat(Matrix<Element, 2, 3> const& upper,
                       Matrix<Element, 2, 3> const& lower) {
        return Matrix(upper.at(0, 0), upper.at(0, 1), upper.at(0, 2),
                      upper.at(1, 0), upper.at(1, 1), upper.at(1, 2),
                      lower.at(0, 0), lower.at(0, 1), lower.at(0, 2),
                      lower.at(1, 0), lower.at(1, 1), lower.at(1, 2));
    }

    /// Forms a 4-by-3 matrix by vertically concatenating a 3-by-3 matrix with a
    /// 1-by-3 matrix
    CUTLASS_HOST_DEVICE
    static Matrix vcat(Matrix<Element, 3, 3> const& upper,
                       Matrix<Element, 1, 3> const& lower) {
        return Matrix(upper.at(0, 0), upper.at(0, 1), upper.at(0, 2),
                      upper.at(1, 0), upper.at(1, 1), upper.at(1, 2),
                      upper.at(2, 0), upper.at(2, 1), upper.at(2, 2),
                      lower.at(0, 0), lower.at(0, 1), lower.at(0, 2));
    }

    /// Forms a 4-by-3 matrix by concatenating four components
    CUTLASS_HOST_DEVICE
    static Matrix block(Element A, Matrix<Element, 1, 2> const& B,
                        Matrix<Element, 3, 1> const& C,
                        Matrix<Element, 3, 2> const& D) {
        return Matrix(A, B.at(0, 0), B.at(0, 1), C.at(0, 0), D.at(0, 0),
                      D.at(0, 1), C.at(1, 0), D.at(1, 0), D.at(1, 1),
                      C.at(2, 0), D.at(2, 0), D.at(2, 1));
    }

    /// Forms a 4-by-3 matrix by concatenating four components
    CUTLASS_HOST_DEVICE
    static Matrix block(Matrix<Element, 1, 2> const& A, Element B,
                        Matrix<Element, 3, 2> const& C,
                        Matrix<Element, 3, 1> const& D) {
        return Matrix(A.at(0, 0), A.at(0, 1), B, C.at(0, 0), C.at(0, 1),
                      D.at(0, 0), C.at(1, 0), C.at(1, 1), D.at(1, 0),
                      C.at(2, 0), C.at(2, 1), D.at(2, 0));
    }

    /// Forms a 4-by-3 matrix by concatenating four components
    CUTLASS_HOST_DEVICE
    static Matrix block(Matrix<Element, 2, 1> const& A,
                        Matrix<Element, 2, 2> const& B,
                        Matrix<Element, 2, 1> const& C,
                        Matrix<Element, 2, 2> const& D) {
        return Matrix(A.at(0, 0), B.at(0, 0), B.at(0, 1), A.at(1, 0),
                      B.at(1, 0), B.at(1, 1), C.at(0, 0), D.at(0, 0),
                      D.at(0, 1), C.at(1, 0), D.at(1, 0), D.at(1, 1));
    }

    /// Forms a 4-by-3 matrix by concatenating four components
    CUTLASS_HOST_DEVICE
    static Matrix block(Matrix<Element, 2, 2> const& A,
                        Matrix<Element, 2, 1> const& B,
                        Matrix<Element, 2, 2> const& C,
                        Matrix<Element, 2, 1> const& D) {
        return Matrix(A.at(0, 0), A.at(0, 1), B.at(0, 0), A.at(1, 0),
                      A.at(1, 1), B.at(1, 0), C.at(0, 0), C.at(0, 1),
                      D.at(0, 0), C.at(1, 0), C.at(1, 1), D.at(1, 0));
    }

    /// Forms a 4-by-3 matrix by concatenating four components
    CUTLASS_HOST_DEVICE
    static Matrix block(Matrix<Element, 3, 1> const& A,
                        Matrix<Element, 3, 2> const& B, Element C,
                        Matrix<Element, 1, 2> const& D) {
        return Matrix(A.at(0, 0), B.at(0, 0), B.at(0, 1), A.at(1, 0),
                      B.at(1, 0), B.at(1, 1), A.at(2, 0), B.at(2, 0),
                      B.at(2, 1), C, D.at(0, 0), D.at(0, 1));
    }

    /// Forms a 4-by-3 matrix by concatenating four components
    CUTLASS_HOST_DEVICE
    static Matrix block(Matrix<Element, 3, 2> const& A,
                        Matrix<Element, 3, 1> const& B,
                        Matrix<Element, 1, 2> const& C, Element D) {
        return Matrix(A.at(0, 0), A.at(0, 1), B.at(0, 0), A.at(1, 0),
                      A.at(1, 1), B.at(1, 0), A.at(2, 0), A.at(2, 1),
                      B.at(2, 0), C.at(0, 0), C.at(0, 1), D);
    }

    /// Elementwise add operator (4-by-3)
    CUTLASS_HOST_DEVICE
    Matrix add(Matrix const& rhs) const {
        Matrix result;

        result.data[0] = data[0] + rhs.data[0];
        result.data[1] = data[1] + rhs.data[1];
        result.data[2] = data[2] + rhs.data[2];

        result.data[3] = data[3] + rhs.data[3];
        result.data[4] = data[4] + rhs.data[4];
        result.data[5] = data[5] + rhs.data[5];

        result.data[6] = data[6] + rhs.data[6];
        result.data[7] = data[7] + rhs.data[7];
        result.data[8] = data[8] + rhs.data[8];

        result.data[9] = data[9] + rhs.data[9];
        result.data[10] = data[10] + rhs.data[10];
        result.data[11] = data[11] + rhs.data[11];

        return result;
    }

    /// Elementwise add operator (4-by-3)
    CUTLASS_HOST_DEVICE
    Matrix operator+(Matrix const& rhs) const { return add(rhs); }

    /// Elementwise add operator (4-by-3)
    CUTLASS_HOST_DEVICE
    Matrix& operator+=(Matrix const& rhs) {
        data[0] += rhs.data[0];
        data[1] += rhs.data[1];
        data[2] += rhs.data[2];

        data[3] += rhs.data[3];
        data[4] += rhs.data[4];
        data[5] += rhs.data[5];

        data[6] += rhs.data[6];
        data[7] += rhs.data[7];
        data[8] += rhs.data[8];

        data[9] += rhs.data[9];
        data[10] += rhs.data[10];
        data[11] += rhs.data[11];

        return *this;
    }

    /// Elementwise subtract operator (4-by-3)
    CUTLASS_HOST_DEVICE
    Matrix subtract(Matrix const& rhs) const {
        Matrix result;

        result.data[0] = data[0] - rhs.data[0];
        result.data[1] = data[1] - rhs.data[1];
        result.data[2] = data[2] - rhs.data[2];

        result.data[3] = data[3] - rhs.data[3];
        result.data[4] = data[4] - rhs.data[4];
        result.data[5] = data[5] - rhs.data[5];

        result.data[6] = data[6] - rhs.data[6];
        result.data[7] = data[7] - rhs.data[7];
        result.data[8] = data[8] - rhs.data[8];

        result.data[9] = data[9] - rhs.data[9];
        result.data[10] = data[10] - rhs.data[10];
        result.data[11] = data[11] - rhs.data[11];

        return result;
    }

    /// Elementwise subtract operator (4-by-3)
    CUTLASS_HOST_DEVICE
    Matrix operator-(Matrix const& rhs) const { return subtract(rhs); }

    /// Elementwise subtract operator (4-by-3)
    CUTLASS_HOST_DEVICE
    Matrix& operator-=(Matrix const& rhs) {
        data[0] -= rhs.data[0];
        data[1] -= rhs.data[1];
        data[2] -= rhs.data[2];

        data[3] -= rhs.data[3];
        data[4] -= rhs.data[4];
        data[5] -= rhs.data[5];

        data[6] -= rhs.data[6];
        data[7] -= rhs.data[7];
        data[8] -= rhs.data[8];

        data[9] -= rhs.data[9];
        data[10] -= rhs.data[10];
        data[11] -= rhs.data[11];

        return *this;
    }

    /// Elementwise multiply operator (4-by-3)
    CUTLASS_HOST_DEVICE
    Matrix multiply(Matrix const& rhs) const {
        Matrix result;

        result.data[0] = data[0] * rhs.data[0];
        result.data[1] = data[1] * rhs.data[1];
        result.data[2] = data[2] * rhs.data[2];

        result.data[3] = data[3] * rhs.data[3];
        result.data[4] = data[4] * rhs.data[4];
        result.data[5] = data[5] * rhs.data[5];

        result.data[6] = data[6] * rhs.data[6];
        result.data[7] = data[7] * rhs.data[7];
        result.data[8] = data[8] * rhs.data[8];

        result.data[9] = data[9] * rhs.data[9];
        result.data[10] = data[10] * rhs.data[10];
        result.data[11] = data[11] * rhs.data[11];

        return result;
    }

    /// Scalar multiply operator (4-by-3)
    CUTLASS_HOST_DEVICE
    Matrix multiply(Element const& s) const {
        Matrix result;

        result.data[0] = data[0] * s;
        result.data[1] = data[1] * s;
        result.data[2] = data[2] * s;

        result.data[3] = data[3] * s;
        result.data[4] = data[4] * s;
        result.data[5] = data[5] * s;

        result.data[6] = data[6] * s;
        result.data[7] = data[7] * s;
        result.data[8] = data[8] * s;

        result.data[9] = data[9] * s;
        result.data[10] = data[10] * s;
        result.data[11] = data[11] * s;

        return result;
    }

    /// Scalar multiply operator (4-by-3)
    CUTLASS_HOST_DEVICE
    Matrix operator*(Element const& s) const { return multiply(s); }

    /// Scalar multiply operator (4-by-3)
    CUTLASS_HOST_DEVICE
    Matrix& operator*=(Element const& s) {
        data[0] *= s;
        data[1] *= s;
        data[2] *= s;

        data[3] *= s;
        data[4] *= s;
        data[5] *= s;

        data[6] *= s;
        data[7] *= s;
        data[8] *= s;

        data[9] *= s;
        data[10] *= s;
        data[11] *= s;

        return *this;
    }

    /// Elementwise divide operator (4-by-3)
    CUTLASS_HOST_DEVICE
    Matrix divide(Matrix const& rhs) const {
        Matrix result;

        result.data[0] = data[0] / rhs.data[0];
        result.data[1] = data[1] / rhs.data[1];
        result.data[2] = data[2] / rhs.data[2];

        result.data[3] = data[3] / rhs.data[3];
        result.data[4] = data[4] / rhs.data[4];
        result.data[5] = data[5] / rhs.data[5];

        result.data[6] = data[6] / rhs.data[6];
        result.data[7] = data[7] / rhs.data[7];
        result.data[8] = data[8] / rhs.data[8];

        result.data[9] = data[9] / rhs.data[9];
        result.data[10] = data[10] / rhs.data[10];
        result.data[11] = data[11] / rhs.data[11];

        return result;
    }

    /// Scalar divide operator (4-by-3)
    CUTLASS_HOST_DEVICE
    Matrix divide(Element const& s) const {
        Matrix result;

        result.data[0] = data[0] / s;
        result.data[1] = data[1] / s;
        result.data[2] = data[2] / s;

        result.data[3] = data[3] / s;
        result.data[4] = data[4] / s;
        result.data[5] = data[5] / s;

        result.data[6] = data[6] / s;
        result.data[7] = data[7] / s;
        result.data[8] = data[8] / s;

        result.data[9] = data[9] / s;
        result.data[10] = data[10] / s;
        result.data[11] = data[11] / s;

        return result;
    }

    /// Scalar divide operator (4-by-3)
    CUTLASS_HOST_DEVICE
    Matrix operator/(Element const& s) const { return divide(s); }

    /// Scalar divide operator (4-by-3)
    CUTLASS_HOST_DEVICE
    Matrix& operator/=(Element const& s) {
        data[0] /= s;
        data[1] /= s;
        data[2] /= s;

        data[3] /= s;
        data[4] /= s;
        data[5] /= s;

        data[6] /= s;
        data[7] /= s;
        data[8] /= s;

        data[9] /= s;
        data[10] /= s;
        data[11] /= s;

        return *this;
    }

    /// Elementwise divide operator (4-by-3)
    CUTLASS_HOST_DEVICE
    Matrix operator/(Matrix const& rhs) const { return divide(rhs); }

    /// Elementwise divide operator (4-by-3)
    CUTLASS_HOST_DEVICE
    Matrix& operator/=(Matrix const& rhs) {
        data[0] /= rhs.data[0];
        data[1] /= rhs.data[1];
        data[2] /= rhs.data[2];

        data[3] /= rhs.data[3];
        data[4] /= rhs.data[4];
        data[5] /= rhs.data[5];

        data[6] /= rhs.data[6];
        data[7] /= rhs.data[7];
        data[8] /= rhs.data[8];

        data[9] /= rhs.data[9];
        data[10] /= rhs.data[10];
        data[11] /= rhs.data[11];

        return *this;
    }

    /// Negates each element of the matrix
    CUTLASS_HOST_DEVICE
    Matrix operator-() const {
        Matrix m;

        m.data[0] = -m.data[0];
        m.data[1] = -m.data[1];
        m.data[2] = -m.data[2];
        m.data[3] = -m.data[3];
        m.data[4] = -m.data[4];
        m.data[5] = -m.data[5];
        m.data[6] = -m.data[6];
        m.data[7] = -m.data[7];
        m.data[8] = -m.data[8];
        m.data[9] = -m.data[9];
        m.data[10] = -m.data[10];
        m.data[11] = -m.data[11];

        return m;
    }

    /// Matrix product of size 4-by-1-by-3
    CUTLASS_HOST_DEVICE
    Matrix<Element, 4, 1> product(
            Matrix<Element, 3, 1> const& rhs,
            Matrix<Element, 4, 1> accum = Matrix<Element, 4, 1>()) const {
        // k=0
        accum.data[0] += data[0] * rhs.data[0];
        accum.data[1] += data[3] * rhs.data[0];
        accum.data[2] += data[6] * rhs.data[0];
        accum.data[3] += data[9] * rhs.data[0];

        // k=1
        accum.data[0] += data[1] * rhs.data[1];
        accum.data[1] += data[4] * rhs.data[1];
        accum.data[2] += data[7] * rhs.data[1];
        accum.data[3] += data[10] * rhs.data[1];

        // k=2
        accum.data[0] += data[2] * rhs.data[2];
        accum.data[1] += data[5] * rhs.data[2];
        accum.data[2] += data[8] * rhs.data[2];
        accum.data[3] += data[11] * rhs.data[2];

        return accum;
    }

    /// Matrix product of size 4-by-1-by-3
    CUTLASS_HOST_DEVICE
    Matrix<Element, 4, 1> operator*(Matrix<Element, 3, 1> const& rhs) const {
        return product(rhs);
    }

    /// Matrix product of size 4-by-2-by-3
    CUTLASS_HOST_DEVICE
    Matrix<Element, 4, 2> product(
            Matrix<Element, 3, 2> const& rhs,
            Matrix<Element, 4, 2> accum = Matrix<Element, 4, 2>()) const {
        // k=0
        accum.data[0] += data[0] * rhs.data[0];
        accum.data[1] += data[0] * rhs.data[1];
        accum.data[2] += data[3] * rhs.data[0];
        accum.data[3] += data[3] * rhs.data[1];
        accum.data[4] += data[6] * rhs.data[0];
        accum.data[5] += data[6] * rhs.data[1];
        accum.data[6] += data[9] * rhs.data[0];
        accum.data[7] += data[9] * rhs.data[1];

        // k=1
        accum.data[0] += data[1] * rhs.data[2];
        accum.data[1] += data[1] * rhs.data[3];
        accum.data[2] += data[4] * rhs.data[2];
        accum.data[3] += data[4] * rhs.data[3];
        accum.data[4] += data[7] * rhs.data[2];
        accum.data[5] += data[7] * rhs.data[3];
        accum.data[6] += data[10] * rhs.data[2];
        accum.data[7] += data[10] * rhs.data[3];

        // k=2
        accum.data[0] += data[2] * rhs.data[4];
        accum.data[1] += data[2] * rhs.data[5];
        accum.data[2] += data[5] * rhs.data[4];
        accum.data[3] += data[5] * rhs.data[5];
        accum.data[4] += data[8] * rhs.data[4];
        accum.data[5] += data[8] * rhs.data[5];
        accum.data[6] += data[11] * rhs.data[4];
        accum.data[7] += data[11] * rhs.data[5];

        return accum;
    }

    /// Matrix product of size 4-by-2-by-3
    CUTLASS_HOST_DEVICE
    Matrix<Element, 4, 2> operator*(Matrix<Element, 3, 2> const& rhs) const {
        return product(rhs);
    }

    /// Matrix product of size 4-by-3-by-3
    CUTLASS_HOST_DEVICE
    Matrix<Element, 4, 3> product(
            Matrix<Element, 3, 3> const& rhs,
            Matrix<Element, 4, 3> accum = Matrix<Element, 4, 3>()) const {
        // k=0
        accum.data[0] += data[0] * rhs.data[0];
        accum.data[1] += data[0] * rhs.data[1];
        accum.data[2] += data[0] * rhs.data[2];
        accum.data[3] += data[3] * rhs.data[0];
        accum.data[4] += data[3] * rhs.data[1];
        accum.data[5] += data[3] * rhs.data[2];
        accum.data[6] += data[6] * rhs.data[0];
        accum.data[7] += data[6] * rhs.data[1];
        accum.data[8] += data[6] * rhs.data[2];
        accum.data[9] += data[9] * rhs.data[0];
        accum.data[10] += data[9] * rhs.data[1];
        accum.data[11] += data[9] * rhs.data[2];

        // k=1
        accum.data[0] += data[1] * rhs.data[3];
        accum.data[1] += data[1] * rhs.data[4];
        accum.data[2] += data[1] * rhs.data[5];
        accum.data[3] += data[4] * rhs.data[3];
        accum.data[4] += data[4] * rhs.data[4];
        accum.data[5] += data[4] * rhs.data[5];
        accum.data[6] += data[7] * rhs.data[3];
        accum.data[7] += data[7] * rhs.data[4];
        accum.data[8] += data[7] * rhs.data[5];
        accum.data[9] += data[10] * rhs.data[3];
        accum.data[10] += data[10] * rhs.data[4];
        accum.data[11] += data[10] * rhs.data[5];

        // k=2
        accum.data[0] += data[2] * rhs.data[6];
        accum.data[1] += data[2] * rhs.data[7];
        accum.data[2] += data[2] * rhs.data[8];
        accum.data[3] += data[5] * rhs.data[6];
        accum.data[4] += data[5] * rhs.data[7];
        accum.data[5] += data[5] * rhs.data[8];
        accum.data[6] += data[8] * rhs.data[6];
        accum.data[7] += data[8] * rhs.data[7];
        accum.data[8] += data[8] * rhs.data[8];
        accum.data[9] += data[11] * rhs.data[6];
        accum.data[10] += data[11] * rhs.data[7];
        accum.data[11] += data[11] * rhs.data[8];

        return accum;
    }

    /// Matrix product of size 4-by-3-by-3
    CUTLASS_HOST_DEVICE
    Matrix<Element, 4, 3> operator*(Matrix<Element, 3, 3> const& rhs) const {
        return product(rhs);
    }

    /// Matrix product of size 4-by-3-by-3
    CUTLASS_HOST_DEVICE
    Matrix& operator*=(Matrix<Element, 3, 3> const& rhs) {
        *this = product(rhs);
        return *this;
    }

    /// Matrix product of size 4-by-4-by-3
    CUTLASS_HOST_DEVICE
    Matrix<Element, 4, 4> product(
            Matrix<Element, 3, 4> const& rhs,
            Matrix<Element, 4, 4> accum = Matrix<Element, 4, 4>()) const {
        // k=0
        accum.data[0] += data[0] * rhs.data[0];
        accum.data[1] += data[0] * rhs.data[1];
        accum.data[2] += data[0] * rhs.data[2];
        accum.data[3] += data[0] * rhs.data[3];
        accum.data[4] += data[3] * rhs.data[0];
        accum.data[5] += data[3] * rhs.data[1];
        accum.data[6] += data[3] * rhs.data[2];
        accum.data[7] += data[3] * rhs.data[3];
        accum.data[8] += data[6] * rhs.data[0];
        accum.data[9] += data[6] * rhs.data[1];
        accum.data[10] += data[6] * rhs.data[2];
        accum.data[11] += data[6] * rhs.data[3];
        accum.data[12] += data[9] * rhs.data[0];
        accum.data[13] += data[9] * rhs.data[1];
        accum.data[14] += data[9] * rhs.data[2];
        accum.data[15] += data[9] * rhs.data[3];

        // k=1
        accum.data[0] += data[1] * rhs.data[4];
        accum.data[1] += data[1] * rhs.data[5];
        accum.data[2] += data[1] * rhs.data[6];
        accum.data[3] += data[1] * rhs.data[7];
        accum.data[4] += data[4] * rhs.data[4];
        accum.data[5] += data[4] * rhs.data[5];
        accum.data[6] += data[4] * rhs.data[6];
        accum.data[7] += data[4] * rhs.data[7];
        accum.data[8] += data[7] * rhs.data[4];
        accum.data[9] += data[7] * rhs.data[5];
        accum.data[10] += data[7] * rhs.data[6];
        accum.data[11] += data[7] * rhs.data[7];
        accum.data[12] += data[10] * rhs.data[4];
        accum.data[13] += data[10] * rhs.data[5];
        accum.data[14] += data[10] * rhs.data[6];
        accum.data[15] += data[10] * rhs.data[7];

        // k=2
        accum.data[0] += data[2] * rhs.data[8];
        accum.data[1] += data[2] * rhs.data[9];
        accum.data[2] += data[2] * rhs.data[10];
        accum.data[3] += data[2] * rhs.data[11];
        accum.data[4] += data[5] * rhs.data[8];
        accum.data[5] += data[5] * rhs.data[9];
        accum.data[6] += data[5] * rhs.data[10];
        accum.data[7] += data[5] * rhs.data[11];
        accum.data[8] += data[8] * rhs.data[8];
        accum.data[9] += data[8] * rhs.data[9];
        accum.data[10] += data[8] * rhs.data[10];
        accum.data[11] += data[8] * rhs.data[11];
        accum.data[12] += data[11] * rhs.data[8];
        accum.data[13] += data[11] * rhs.data[9];
        accum.data[14] += data[11] * rhs.data[10];
        accum.data[15] += data[11] * rhs.data[11];

        return accum;
    }

    /// Matrix product of size 4-by-4-by-3
    CUTLASS_HOST_DEVICE
    Matrix<Element, 4, 4> operator*(Matrix<Element, 3, 4> const& rhs) const {
        return product(rhs);
    }

    /// Returns the sum of elements
    CUTLASS_HOST_DEVICE
    Element sum(Element accum = Element()) const {
        accum += data[0];
        accum += data[1];
        accum += data[2];
        accum += data[3];
        accum += data[4];
        accum += data[5];
        accum += data[6];
        accum += data[7];
        accum += data[8];
        accum += data[9];
        accum += data[10];
        accum += data[11];

        return accum;
    }

    /// Returns the sum of squared elements
    CUTLASS_HOST_DEVICE
    Element norm(Element accum = Element()) const {
        accum += data[0] * data[0];
        accum += data[1] * data[1];
        accum += data[2] * data[2];
        accum += data[3] * data[3];
        accum += data[4] * data[4];
        accum += data[5] * data[5];
        accum += data[6] * data[6];
        accum += data[7] * data[7];
        accum += data[8] * data[8];
        accum += data[9] * data[9];
        accum += data[10] * data[10];
        accum += data[11] * data[11];

        return accum;
    }

    /// Returns square root of the norm
    CUTLASS_HOST_DEVICE
    Element magnitude() const { return fast_sqrt(norm()); }

    /// Returns the sum of diagonal elements
    CUTLASS_HOST_DEVICE
    Element trace(Element accum = Element()) const {
        accum += data[0];
        accum += data[4];
        accum += data[8];

        return accum;
    }
};

/// Template alias for 4-by-3 matrix
template <typename Element>
using Matrix4x3 = Matrix<Element, 4, 3>;

/// Free funciton to infer element type from template arguments
template <typename Element>
CUTLASS_HOST_DEVICE Matrix4x3<Element> make_Matrix4x3(
        Element _0_0, Element _0_1, Element _0_2, Element _1_0, Element _1_1,
        Element _1_2, Element _2_0, Element _2_1, Element _2_2, Element _3_0,
        Element _3_1, Element _3_2) {
    return Matrix4x3<Element>(_0_0, _0_1, _0_2, _1_0, _1_1, _1_2, _2_0, _2_1,
                              _2_2, _3_0, _3_1, _3_2);
}

/////////////////////////////////////////////////////////////////////////////////////////////////

/// 4-by-4 matrix template class definition
template <typename Element_>
struct Matrix<Element_, 4, 4> {
    //
    // Type definitions
    //

    /// Element data type
    using Element = Element_;

    /// Number of rows in matrix
    static int const kRows = 4;

    /// Number of columns in matrix
    static int const kColumns = 4;

    /// Layout of matrix in underlying array
    using Layout = layout::RowMajor;

    /// Number of elements in matrix
    static int const kCount = 16;

    //
    // Data members
    //

    /// Elements of the matrix in row-major layout
    Array<Element, kCount> data;

    //
    // Methods
    //

    /// Constructs a zero matrix
    CUTLASS_HOST_DEVICE
    Matrix() { data.clear(); }

    /// Copy constructor for a 4-by-4 matrix
    CUTLASS_HOST_DEVICE
    Matrix(Matrix const& rhs) { data = rhs.data; }

    /// Constucts a 4-by-4 matrix from scalar elements
    CUTLASS_HOST_DEVICE
    Matrix(Element _0_0, Element _0_1, Element _0_2, Element _0_3, Element _1_0,
           Element _1_1, Element _1_2, Element _1_3, Element _2_0, Element _2_1,
           Element _2_2, Element _2_3, Element _3_0, Element _3_1, Element _3_2,
           Element _3_3) {
        data[0] = _0_0;
        data[1] = _0_1;
        data[2] = _0_2;
        data[3] = _0_3;
        data[4] = _1_0;
        data[5] = _1_1;
        data[6] = _1_2;
        data[7] = _1_3;
        data[8] = _2_0;
        data[9] = _2_1;
        data[10] = _2_2;
        data[11] = _2_3;
        data[12] = _3_0;
        data[13] = _3_1;
        data[14] = _3_2;
        data[15] = _3_3;
    }

    /// Constucts a 4-by-4 matrix from row vectors
    CUTLASS_HOST_DEVICE
    Matrix(Matrix<Element, 1, 4> const& row_0,
           Matrix<Element, 1, 4> const& row_1,
           Matrix<Element, 1, 4> const& row_2,
           Matrix<Element, 1, 4> const& row_3) {
        data[0] = row_0.data[0];
        data[1] = row_0.data[1];
        data[2] = row_0.data[2];
        data[3] = row_0.data[3];
        data[4] = row_1.data[0];
        data[5] = row_1.data[1];
        data[6] = row_1.data[2];
        data[7] = row_1.data[3];
        data[8] = row_2.data[0];
        data[9] = row_2.data[1];
        data[10] = row_2.data[2];
        data[11] = row_2.data[3];
        data[12] = row_3.data[0];
        data[13] = row_3.data[1];
        data[14] = row_3.data[2];
        data[15] = row_3.data[3];
    }

    /// Static method to construct a 4-by-4 matrix from column vectors
    CUTLASS_HOST_DEVICE
    static Matrix from_columns(Matrix<Element, 4, 1> const& column_0,
                               Matrix<Element, 4, 1> const& column_1,
                               Matrix<Element, 4, 1> const& column_2,
                               Matrix<Element, 4, 1> const& column_3) {
        Matrix result;

        result.data[0] = column_0.data[0];
        result.data[1] = column_1.data[0];
        result.data[2] = column_2.data[0];
        result.data[3] = column_3.data[0];
        result.data[4] = column_0.data[1];
        result.data[5] = column_1.data[1];
        result.data[6] = column_2.data[1];
        result.data[7] = column_3.data[1];
        result.data[8] = column_0.data[2];
        result.data[9] = column_1.data[2];
        result.data[10] = column_2.data[2];
        result.data[11] = column_3.data[2];
        result.data[12] = column_0.data[3];
        result.data[13] = column_1.data[3];
        result.data[14] = column_2.data[3];
        result.data[15] = column_3.data[3];
        return result;
    }

    /// Constructs an identity matrix
    CUTLASS_HOST_DEVICE
    static Matrix identity() {
        Matrix m;

        m.data[0] = Element(1);
        m.data[5] = Element(1);
        m.data[10] = Element(1);
        m.data[15] = Element(1);

        return m;
    }

    /// Constructs a matrix from a uniform element
    CUTLASS_HOST_DEVICE
    static Matrix uniform(Element s) {
        Matrix m;

        m.data[0] = s;
        m.data[1] = s;
        m.data[2] = s;
        m.data[3] = s;
        m.data[4] = s;
        m.data[5] = s;
        m.data[6] = s;
        m.data[7] = s;
        m.data[8] = s;
        m.data[9] = s;
        m.data[10] = s;
        m.data[11] = s;
        m.data[12] = s;
        m.data[13] = s;
        m.data[14] = s;
        m.data[15] = s;

        return m;
    }

    /// Constructs a matrix from a uniform element 1
    CUTLASS_HOST_DEVICE
    static Matrix ones() { return uniform(Element(1)); }

    /// Constructs a matrix from a uniform element 0
    CUTLASS_HOST_DEVICE
    static Matrix zero() { return Matrix(); }

    /// Constructs a matrix from elements along its diagonal
    CUTLASS_HOST_DEVICE
    static Matrix from_diagonal(Matrix<Element, 4, 1> const& diag) {
        Matrix m;

        m.data[0] = diag.data[0];
        m.data[5] = diag.data[1];
        m.data[10] = diag.data[2];
        m.data[15] = diag.data[3];

        return m;
    }

    /// Constructs a matrix from elements along its diagonal
    CUTLASS_HOST_DEVICE
    static Matrix from_diagonal(Matrix<Element, 1, 4> const& diag) {
        Matrix m;

        m.data[0] = diag.data[0];
        m.data[5] = diag.data[1];
        m.data[10] = diag.data[2];
        m.data[15] = diag.data[3];

        return m;
    }

    /// Gets an array of diagonal elements
    CUTLASS_HOST_DEVICE
    Matrix<Element, 4, 1> diagonal() const {
        Matrix<Element, 4, 1> diag;

        diag.data[0] = data[0];
        diag.data[1] = data[5];
        diag.data[2] = data[10];
        diag.data[3] = data[15];

        return diag;
    }

    /// Returns a transposed matrix
    CUTLASS_HOST_DEVICE
    Matrix<Element, 4, 4> transpose() const {
        Matrix<Element, 4, 4> mt;

        mt.data[0] = data[0];
        mt.data[4] = data[1];
        mt.data[8] = data[2];
        mt.data[12] = data[3];
        mt.data[1] = data[4];
        mt.data[5] = data[5];
        mt.data[9] = data[6];
        mt.data[13] = data[7];
        mt.data[2] = data[8];
        mt.data[6] = data[9];
        mt.data[10] = data[10];
        mt.data[14] = data[11];
        mt.data[3] = data[12];
        mt.data[7] = data[13];
        mt.data[11] = data[14];
        mt.data[15] = data[15];

        return mt;
    }

    /// Accesses an element by coordinate
    CUTLASS_HOST_DEVICE
    Element at(int i, int j) const { return data[i * 4 + j]; }

    /// Accesses an element by coordinate
    CUTLASS_HOST_DEVICE
    Element& at(int i, int j) { return data[i * 4 + j]; }

    /// Accesses an element by coordinate
    CUTLASS_HOST_DEVICE
    Element at(Coord<2> const& coord) const { return at(coord[0], coord[1]); }

    /// Accesses an element by coordinate
    CUTLASS_HOST_DEVICE
    Element& at(Coord<2> const& coord) { return at(coord[0], coord[1]); }

    /// Accesses an element by offset
    CUTLASS_HOST_DEVICE
    Element& at(int offset) { return data[offset]; }

    /// Accesses an element by offset
    CUTLASS_HOST_DEVICE
    Element at(int offset) const { return data[offset]; }

    /// Accesses an element by coordinate
    CUTLASS_HOST_DEVICE
    Element operator[](Coord<2> const& coord) const {
        return at(coord[0], coord[1]);
    }

    /// Accesses an element by coordinate
    CUTLASS_HOST_DEVICE
    Element& operator[](Coord<2> const& coord) {
        return at(coord[0], coord[1]);
    }

    /// Accesses an element by offset
    CUTLASS_HOST_DEVICE
    Element& operator[](int offset) { return data[offset]; }

    /// Accesses an element by offset
    CUTLASS_HOST_DEVICE
    Element operator[](int offset) const { return data[offset]; }

    /// Gets a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix<Element, 1, 2> slice_1x2(int i = 0, int j = 0) const {
        Matrix<Element, 1, 2> m;

        m.data[0] = data[i * 4 + j + 0];
        m.data[1] = data[i * 4 + j + 1];

        return m;
    }

    /// Overwrites a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix& set_slice_1x2(Matrix<Element, 1, 2> const& m, int i = 0,
                          int j = 0) {
        data[i * 4 + j + 0] = m.data[0];
        data[i * 4 + j + 1] = m.data[1];

        return *this;
    }

    /// Gets a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix<Element, 1, 3> slice_1x3(int i = 0, int j = 0) const {
        Matrix<Element, 1, 3> m;

        m.data[0] = data[i * 4 + j + 0];
        m.data[1] = data[i * 4 + j + 1];
        m.data[2] = data[i * 4 + j + 2];

        return m;
    }

    /// Overwrites a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix& set_slice_1x3(Matrix<Element, 1, 3> const& m, int i = 0,
                          int j = 0) {
        data[i * 4 + j + 0] = m.data[0];
        data[i * 4 + j + 1] = m.data[1];
        data[i * 4 + j + 2] = m.data[2];

        return *this;
    }

    /// Gets a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix<Element, 1, 4> slice_1x4(int i = 0, int j = 0) const {
        Matrix<Element, 1, 4> m;

        m.data[0] = data[i * 4 + j + 0];
        m.data[1] = data[i * 4 + j + 1];
        m.data[2] = data[i * 4 + j + 2];
        m.data[3] = data[i * 4 + j + 3];

        return m;
    }

    /// Overwrites a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix& set_slice_1x4(Matrix<Element, 1, 4> const& m, int i = 0,
                          int j = 0) {
        data[i * 4 + j + 0] = m.data[0];
        data[i * 4 + j + 1] = m.data[1];
        data[i * 4 + j + 2] = m.data[2];
        data[i * 4 + j + 3] = m.data[3];

        return *this;
    }

    CUTLASS_HOST_DEVICE
    Matrix<Element, 1, 4> row(int i) const { return slice_1x4(i, 0); }

    Matrix& set_row(Matrix<Element, 1, 4> const& v, int i = 0) {
        return set_slice_1x4(v, i, 0);
    }

    /// Gets a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix<Element, 2, 1> slice_2x1(int i = 0, int j = 0) const {
        Matrix<Element, 2, 1> m;

        m.data[0] = data[i * 4 + j + 0];
        m.data[1] = data[i * 4 + j + 4];

        return m;
    }

    /// Overwrites a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix& set_slice_2x1(Matrix<Element, 2, 1> const& m, int i = 0,
                          int j = 0) {
        data[i * 4 + j + 0] = m.data[0];
        data[i * 4 + j + 4] = m.data[1];

        return *this;
    }

    /// Gets a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix<Element, 2, 2> slice_2x2(int i = 0, int j = 0) const {
        Matrix<Element, 2, 2> m;

        m.data[0] = data[i * 4 + j + 0];
        m.data[1] = data[i * 4 + j + 1];
        m.data[2] = data[i * 4 + j + 4];
        m.data[3] = data[i * 4 + j + 5];

        return m;
    }

    /// Overwrites a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix& set_slice_2x2(Matrix<Element, 2, 2> const& m, int i = 0,
                          int j = 0) {
        data[i * 4 + j + 0] = m.data[0];
        data[i * 4 + j + 1] = m.data[1];
        data[i * 4 + j + 4] = m.data[2];
        data[i * 4 + j + 5] = m.data[3];

        return *this;
    }

    /// Gets a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix<Element, 2, 3> slice_2x3(int i = 0, int j = 0) const {
        Matrix<Element, 2, 3> m;

        m.data[0] = data[i * 4 + j + 0];
        m.data[1] = data[i * 4 + j + 1];
        m.data[2] = data[i * 4 + j + 2];
        m.data[3] = data[i * 4 + j + 4];
        m.data[4] = data[i * 4 + j + 5];
        m.data[5] = data[i * 4 + j + 6];

        return m;
    }

    /// Overwrites a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix& set_slice_2x3(Matrix<Element, 2, 3> const& m, int i = 0,
                          int j = 0) {
        data[i * 4 + j + 0] = m.data[0];
        data[i * 4 + j + 1] = m.data[1];
        data[i * 4 + j + 2] = m.data[2];
        data[i * 4 + j + 4] = m.data[3];
        data[i * 4 + j + 5] = m.data[4];
        data[i * 4 + j + 6] = m.data[5];

        return *this;
    }

    /// Gets a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix<Element, 2, 4> slice_2x4(int i = 0, int j = 0) const {
        Matrix<Element, 2, 4> m;

        m.data[0] = data[i * 4 + j + 0];
        m.data[1] = data[i * 4 + j + 1];
        m.data[2] = data[i * 4 + j + 2];
        m.data[3] = data[i * 4 + j + 3];
        m.data[4] = data[i * 4 + j + 4];
        m.data[5] = data[i * 4 + j + 5];
        m.data[6] = data[i * 4 + j + 6];
        m.data[7] = data[i * 4 + j + 7];

        return m;
    }

    /// Overwrites a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix& set_slice_2x4(Matrix<Element, 2, 4> const& m, int i = 0,
                          int j = 0) {
        data[i * 4 + j + 0] = m.data[0];
        data[i * 4 + j + 1] = m.data[1];
        data[i * 4 + j + 2] = m.data[2];
        data[i * 4 + j + 3] = m.data[3];
        data[i * 4 + j + 4] = m.data[4];
        data[i * 4 + j + 5] = m.data[5];
        data[i * 4 + j + 6] = m.data[6];
        data[i * 4 + j + 7] = m.data[7];

        return *this;
    }

    /// Gets a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix<Element, 3, 1> slice_3x1(int i = 0, int j = 0) const {
        Matrix<Element, 3, 1> m;

        m.data[0] = data[i * 4 + j + 0];
        m.data[1] = data[i * 4 + j + 4];
        m.data[2] = data[i * 4 + j + 8];

        return m;
    }

    /// Overwrites a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix& set_slice_3x1(Matrix<Element, 3, 1> const& m, int i = 0,
                          int j = 0) {
        data[i * 4 + j + 0] = m.data[0];
        data[i * 4 + j + 4] = m.data[1];
        data[i * 4 + j + 8] = m.data[2];

        return *this;
    }

    /// Gets a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix<Element, 3, 2> slice_3x2(int i = 0, int j = 0) const {
        Matrix<Element, 3, 2> m;

        m.data[0] = data[i * 4 + j + 0];
        m.data[1] = data[i * 4 + j + 1];
        m.data[2] = data[i * 4 + j + 4];
        m.data[3] = data[i * 4 + j + 5];
        m.data[4] = data[i * 4 + j + 8];
        m.data[5] = data[i * 4 + j + 9];

        return m;
    }

    /// Overwrites a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix& set_slice_3x2(Matrix<Element, 3, 2> const& m, int i = 0,
                          int j = 0) {
        data[i * 4 + j + 0] = m.data[0];
        data[i * 4 + j + 1] = m.data[1];
        data[i * 4 + j + 4] = m.data[2];
        data[i * 4 + j + 5] = m.data[3];
        data[i * 4 + j + 8] = m.data[4];
        data[i * 4 + j + 9] = m.data[5];

        return *this;
    }

    /// Gets a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix<Element, 3, 3> slice_3x3(int i = 0, int j = 0) const {
        Matrix<Element, 3, 3> m;

        m.data[0] = data[i * 4 + j + 0];
        m.data[1] = data[i * 4 + j + 1];
        m.data[2] = data[i * 4 + j + 2];
        m.data[3] = data[i * 4 + j + 4];
        m.data[4] = data[i * 4 + j + 5];
        m.data[5] = data[i * 4 + j + 6];
        m.data[6] = data[i * 4 + j + 8];
        m.data[7] = data[i * 4 + j + 9];
        m.data[8] = data[i * 4 + j + 10];

        return m;
    }

    /// Overwrites a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix& set_slice_3x3(Matrix<Element, 3, 3> const& m, int i = 0,
                          int j = 0) {
        data[i * 4 + j + 0] = m.data[0];
        data[i * 4 + j + 1] = m.data[1];
        data[i * 4 + j + 2] = m.data[2];
        data[i * 4 + j + 4] = m.data[3];
        data[i * 4 + j + 5] = m.data[4];
        data[i * 4 + j + 6] = m.data[5];
        data[i * 4 + j + 8] = m.data[6];
        data[i * 4 + j + 9] = m.data[7];
        data[i * 4 + j + 10] = m.data[8];

        return *this;
    }

    /// Gets a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix<Element, 3, 4> slice_3x4(int i = 0, int j = 0) const {
        Matrix<Element, 3, 4> m;

        m.data[0] = data[i * 4 + j + 0];
        m.data[1] = data[i * 4 + j + 1];
        m.data[2] = data[i * 4 + j + 2];
        m.data[3] = data[i * 4 + j + 3];
        m.data[4] = data[i * 4 + j + 4];
        m.data[5] = data[i * 4 + j + 5];
        m.data[6] = data[i * 4 + j + 6];
        m.data[7] = data[i * 4 + j + 7];
        m.data[8] = data[i * 4 + j + 8];
        m.data[9] = data[i * 4 + j + 9];
        m.data[10] = data[i * 4 + j + 10];
        m.data[11] = data[i * 4 + j + 11];

        return m;
    }

    /// Overwrites a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix& set_slice_3x4(Matrix<Element, 3, 4> const& m, int i = 0,
                          int j = 0) {
        data[i * 4 + j + 0] = m.data[0];
        data[i * 4 + j + 1] = m.data[1];
        data[i * 4 + j + 2] = m.data[2];
        data[i * 4 + j + 3] = m.data[3];
        data[i * 4 + j + 4] = m.data[4];
        data[i * 4 + j + 5] = m.data[5];
        data[i * 4 + j + 6] = m.data[6];
        data[i * 4 + j + 7] = m.data[7];
        data[i * 4 + j + 8] = m.data[8];
        data[i * 4 + j + 9] = m.data[9];
        data[i * 4 + j + 10] = m.data[10];
        data[i * 4 + j + 11] = m.data[11];

        return *this;
    }

    /// Gets a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix<Element, 4, 1> slice_4x1(int i = 0, int j = 0) const {
        Matrix<Element, 4, 1> m;

        m.data[0] = data[i * 4 + j + 0];
        m.data[1] = data[i * 4 + j + 4];
        m.data[2] = data[i * 4 + j + 8];
        m.data[3] = data[i * 4 + j + 12];

        return m;
    }

    /// Overwrites a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix& set_slice_4x1(Matrix<Element, 4, 1> const& m, int i = 0,
                          int j = 0) {
        data[i * 4 + j + 0] = m.data[0];
        data[i * 4 + j + 4] = m.data[1];
        data[i * 4 + j + 8] = m.data[2];
        data[i * 4 + j + 12] = m.data[3];

        return *this;
    }

    CUTLASS_HOST_DEVICE
    Matrix<Element, 4, 1> column(int j) const { return slice_4x1(0, j); }

    Matrix& set_column(Matrix<Element, 4, 1> const& v, int j = 0) {
        return set_slice_4x1(v, 0, j);
    }

    /// Gets a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix<Element, 4, 2> slice_4x2(int i = 0, int j = 0) const {
        Matrix<Element, 4, 2> m;

        m.data[0] = data[i * 4 + j + 0];
        m.data[1] = data[i * 4 + j + 1];
        m.data[2] = data[i * 4 + j + 4];
        m.data[3] = data[i * 4 + j + 5];
        m.data[4] = data[i * 4 + j + 8];
        m.data[5] = data[i * 4 + j + 9];
        m.data[6] = data[i * 4 + j + 12];
        m.data[7] = data[i * 4 + j + 13];

        return m;
    }

    /// Overwrites a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix& set_slice_4x2(Matrix<Element, 4, 2> const& m, int i = 0,
                          int j = 0) {
        data[i * 4 + j + 0] = m.data[0];
        data[i * 4 + j + 1] = m.data[1];
        data[i * 4 + j + 4] = m.data[2];
        data[i * 4 + j + 5] = m.data[3];
        data[i * 4 + j + 8] = m.data[4];
        data[i * 4 + j + 9] = m.data[5];
        data[i * 4 + j + 12] = m.data[6];
        data[i * 4 + j + 13] = m.data[7];

        return *this;
    }

    /// Gets a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix<Element, 4, 3> slice_4x3(int i = 0, int j = 0) const {
        Matrix<Element, 4, 3> m;

        m.data[0] = data[i * 4 + j + 0];
        m.data[1] = data[i * 4 + j + 1];
        m.data[2] = data[i * 4 + j + 2];
        m.data[3] = data[i * 4 + j + 4];
        m.data[4] = data[i * 4 + j + 5];
        m.data[5] = data[i * 4 + j + 6];
        m.data[6] = data[i * 4 + j + 8];
        m.data[7] = data[i * 4 + j + 9];
        m.data[8] = data[i * 4 + j + 10];
        m.data[9] = data[i * 4 + j + 12];
        m.data[10] = data[i * 4 + j + 13];
        m.data[11] = data[i * 4 + j + 14];

        return m;
    }

    /// Overwrites a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix& set_slice_4x3(Matrix<Element, 4, 3> const& m, int i = 0,
                          int j = 0) {
        data[i * 4 + j + 0] = m.data[0];
        data[i * 4 + j + 1] = m.data[1];
        data[i * 4 + j + 2] = m.data[2];
        data[i * 4 + j + 4] = m.data[3];
        data[i * 4 + j + 5] = m.data[4];
        data[i * 4 + j + 6] = m.data[5];
        data[i * 4 + j + 8] = m.data[6];
        data[i * 4 + j + 9] = m.data[7];
        data[i * 4 + j + 10] = m.data[8];
        data[i * 4 + j + 12] = m.data[9];
        data[i * 4 + j + 13] = m.data[10];
        data[i * 4 + j + 14] = m.data[11];

        return *this;
    }

    /// Gets a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix<Element, 4, 4> slice_4x4(int i = 0, int j = 0) const {
        Matrix<Element, 4, 4> m;

        m.data[0] = data[i * 4 + j + 0];
        m.data[1] = data[i * 4 + j + 1];
        m.data[2] = data[i * 4 + j + 2];
        m.data[3] = data[i * 4 + j + 3];
        m.data[4] = data[i * 4 + j + 4];
        m.data[5] = data[i * 4 + j + 5];
        m.data[6] = data[i * 4 + j + 6];
        m.data[7] = data[i * 4 + j + 7];
        m.data[8] = data[i * 4 + j + 8];
        m.data[9] = data[i * 4 + j + 9];
        m.data[10] = data[i * 4 + j + 10];
        m.data[11] = data[i * 4 + j + 11];
        m.data[12] = data[i * 4 + j + 12];
        m.data[13] = data[i * 4 + j + 13];
        m.data[14] = data[i * 4 + j + 14];
        m.data[15] = data[i * 4 + j + 15];

        return m;
    }

    /// Overwrites a submatrix with optional offset
    CUTLASS_HOST_DEVICE
    Matrix& set_slice_4x4(Matrix<Element, 4, 4> const& m, int i = 0,
                          int j = 0) {
        data[i * 4 + j + 0] = m.data[0];
        data[i * 4 + j + 1] = m.data[1];
        data[i * 4 + j + 2] = m.data[2];
        data[i * 4 + j + 3] = m.data[3];
        data[i * 4 + j + 4] = m.data[4];
        data[i * 4 + j + 5] = m.data[5];
        data[i * 4 + j + 6] = m.data[6];
        data[i * 4 + j + 7] = m.data[7];
        data[i * 4 + j + 8] = m.data[8];
        data[i * 4 + j + 9] = m.data[9];
        data[i * 4 + j + 10] = m.data[10];
        data[i * 4 + j + 11] = m.data[11];
        data[i * 4 + j + 12] = m.data[12];
        data[i * 4 + j + 13] = m.data[13];
        data[i * 4 + j + 14] = m.data[14];
        data[i * 4 + j + 15] = m.data[15];

        return *this;
    }

    /// Forms a 4-by-4 matrix by horizontally concatenating a 4-by-1 matrix with
    /// a 4-by-3 matrix
    CUTLASS_HOST_DEVICE
    static Matrix hcat(Matrix<Element, 4, 1> const& lhs,
                       Matrix<Element, 4, 3> const& rhs) {
        return Matrix(lhs.at(0, 0), rhs.at(0, 0), rhs.at(0, 1), rhs.at(0, 2),
                      lhs.at(1, 0), rhs.at(1, 0), rhs.at(1, 1), rhs.at(1, 2),
                      lhs.at(2, 0), rhs.at(2, 0), rhs.at(2, 1), rhs.at(2, 2),
                      lhs.at(3, 0), rhs.at(3, 0), rhs.at(3, 1), rhs.at(3, 2));
    }

    /// Forms a 4-by-4 matrix by horizontally concatenating a 4-by-2 matrix with
    /// a 4-by-2 matrix
    CUTLASS_HOST_DEVICE
    static Matrix hcat(Matrix<Element, 4, 2> const& lhs,
                       Matrix<Element, 4, 2> const& rhs) {
        return Matrix(lhs.at(0, 0), lhs.at(0, 1), rhs.at(0, 0), rhs.at(0, 1),
                      lhs.at(1, 0), lhs.at(1, 1), rhs.at(1, 0), rhs.at(1, 1),
                      lhs.at(2, 0), lhs.at(2, 1), rhs.at(2, 0), rhs.at(2, 1),
                      lhs.at(3, 0), lhs.at(3, 1), rhs.at(3, 0), rhs.at(3, 1));
    }

    /// Forms a 4-by-4 matrix by horizontally concatenating a 4-by-3 matrix with
    /// a 4-by-1 matrix
    CUTLASS_HOST_DEVICE
    static Matrix hcat(Matrix<Element, 4, 3> const& lhs,
                       Matrix<Element, 4, 1> const& rhs) {
        return Matrix(lhs.at(0, 0), lhs.at(0, 1), lhs.at(0, 2), rhs.at(0, 0),
                      lhs.at(1, 0), lhs.at(1, 1), lhs.at(1, 2), rhs.at(1, 0),
                      lhs.at(2, 0), lhs.at(2, 1), lhs.at(2, 2), rhs.at(2, 0),
                      lhs.at(3, 0), lhs.at(3, 1), lhs.at(3, 2), rhs.at(3, 0));
    }

    /// Forms a 4-by-4 matrix by vertically concatenating a 1-by-4 matrix with a
    /// 3-by-4 matrix
    CUTLASS_HOST_DEVICE
    static Matrix vcat(Matrix<Element, 1, 4> const& upper,
                       Matrix<Element, 3, 4> const& lower) {
        return Matrix(
                upper.at(0, 0), upper.at(0, 1), upper.at(0, 2), upper.at(0, 3),
                lower.at(0, 0), lower.at(0, 1), lower.at(0, 2), lower.at(0, 3),
                lower.at(1, 0), lower.at(1, 1), lower.at(1, 2), lower.at(1, 3),
                lower.at(2, 0), lower.at(2, 1), lower.at(2, 2), lower.at(2, 3));
    }

    /// Forms a 4-by-4 matrix by vertically concatenating a 2-by-4 matrix with a
    /// 2-by-4 matrix
    CUTLASS_HOST_DEVICE
    static Matrix vcat(Matrix<Element, 2, 4> const& upper,
                       Matrix<Element, 2, 4> const& lower) {
        return Matrix(
                upper.at(0, 0), upper.at(0, 1), upper.at(0, 2), upper.at(0, 3),
                upper.at(1, 0), upper.at(1, 1), upper.at(1, 2), upper.at(1, 3),
                lower.at(0, 0), lower.at(0, 1), lower.at(0, 2), lower.at(0, 3),
                lower.at(1, 0), lower.at(1, 1), lower.at(1, 2), lower.at(1, 3));
    }

    /// Forms a 4-by-4 matrix by vertically concatenating a 3-by-4 matrix with a
    /// 1-by-4 matrix
    CUTLASS_HOST_DEVICE
    static Matrix vcat(Matrix<Element, 3, 4> const& upper,
                       Matrix<Element, 1, 4> const& lower) {
        return Matrix(
                upper.at(0, 0), upper.at(0, 1), upper.at(0, 2), upper.at(0, 3),
                upper.at(1, 0), upper.at(1, 1), upper.at(1, 2), upper.at(1, 3),
                upper.at(2, 0), upper.at(2, 1), upper.at(2, 2), upper.at(2, 3),
                lower.at(0, 0), lower.at(0, 1), lower.at(0, 2), lower.at(0, 3));
    }

    /// Forms a 4-by-4 matrix by concatenating four components
    CUTLASS_HOST_DEVICE
    static Matrix block(Element A, Matrix<Element, 1, 3> const& B,
                        Matrix<Element, 3, 1> const& C,
                        Matrix<Element, 3, 3> const& D) {
        return Matrix(A, B.at(0, 0), B.at(0, 1), B.at(0, 2), C.at(0, 0),
                      D.at(0, 0), D.at(0, 1), D.at(0, 2), C.at(1, 0),
                      D.at(1, 0), D.at(1, 1), D.at(1, 2), C.at(2, 0),
                      D.at(2, 0), D.at(2, 1), D.at(2, 2));
    }

    /// Forms a 4-by-4 matrix by concatenating four components
    CUTLASS_HOST_DEVICE
    static Matrix block(Matrix<Element, 1, 2> const& A,
                        Matrix<Element, 1, 2> const& B,
                        Matrix<Element, 3, 2> const& C,
                        Matrix<Element, 3, 2> const& D) {
        return Matrix(A.at(0, 0), A.at(0, 1), B.at(0, 0), B.at(0, 1),
                      C.at(0, 0), C.at(0, 1), D.at(0, 0), D.at(0, 1),
                      C.at(1, 0), C.at(1, 1), D.at(1, 0), D.at(1, 1),
                      C.at(2, 0), C.at(2, 1), D.at(2, 0), D.at(2, 1));
    }

    /// Forms a 4-by-4 matrix by concatenating four components
    CUTLASS_HOST_DEVICE
    static Matrix block(Matrix<Element, 1, 3> const& A, Element B,
                        Matrix<Element, 3, 3> const& C,
                        Matrix<Element, 3, 1> const& D) {
        return Matrix(A.at(0, 0), A.at(0, 1), A.at(0, 2), B, C.at(0, 0),
                      C.at(0, 1), C.at(0, 2), D.at(0, 0), C.at(1, 0),
                      C.at(1, 1), C.at(1, 2), D.at(1, 0), C.at(2, 0),
                      C.at(2, 1), C.at(2, 2), D.at(2, 0));
    }

    /// Forms a 4-by-4 matrix by concatenating four components
    CUTLASS_HOST_DEVICE
    static Matrix block(Matrix<Element, 2, 1> const& A,
                        Matrix<Element, 2, 3> const& B,
                        Matrix<Element, 2, 1> const& C,
                        Matrix<Element, 2, 3> const& D) {
        return Matrix(A.at(0, 0), B.at(0, 0), B.at(0, 1), B.at(0, 2),
                      A.at(1, 0), B.at(1, 0), B.at(1, 1), B.at(1, 2),
                      C.at(0, 0), D.at(0, 0), D.at(0, 1), D.at(0, 2),
                      C.at(1, 0), D.at(1, 0), D.at(1, 1), D.at(1, 2));
    }

    /// Forms a 4-by-4 matrix by concatenating four components
    CUTLASS_HOST_DEVICE
    static Matrix block(Matrix<Element, 2, 2> const& A,
                        Matrix<Element, 2, 2> const& B,
                        Matrix<Element, 2, 2> const& C,
                        Matrix<Element, 2, 2> const& D) {
        return Matrix(A.at(0, 0), A.at(0, 1), B.at(0, 0), B.at(0, 1),
                      A.at(1, 0), A.at(1, 1), B.at(1, 0), B.at(1, 1),
                      C.at(0, 0), C.at(0, 1), D.at(0, 0), D.at(0, 1),
                      C.at(1, 0), C.at(1, 1), D.at(1, 0), D.at(1, 1));
    }

    /// Forms a 4-by-4 matrix by concatenating four components
    CUTLASS_HOST_DEVICE
    static Matrix block(Matrix<Element, 2, 3> const& A,
                        Matrix<Element, 2, 1> const& B,
                        Matrix<Element, 2, 3> const& C,
                        Matrix<Element, 2, 1> const& D) {
        return Matrix(A.at(0, 0), A.at(0, 1), A.at(0, 2), B.at(0, 0),
                      A.at(1, 0), A.at(1, 1), A.at(1, 2), B.at(1, 0),
                      C.at(0, 0), C.at(0, 1), C.at(0, 2), D.at(0, 0),
                      C.at(1, 0), C.at(1, 1), C.at(1, 2), D.at(1, 0));
    }

    /// Forms a 4-by-4 matrix by concatenating four components
    CUTLASS_HOST_DEVICE
    static Matrix block(Matrix<Element, 3, 1> const& A,
                        Matrix<Element, 3, 3> const& B, Element C,
                        Matrix<Element, 1, 3> const& D) {
        return Matrix(A.at(0, 0), B.at(0, 0), B.at(0, 1), B.at(0, 2),
                      A.at(1, 0), B.at(1, 0), B.at(1, 1), B.at(1, 2),
                      A.at(2, 0), B.at(2, 0), B.at(2, 1), B.at(2, 2), C,
                      D.at(0, 0), D.at(0, 1), D.at(0, 2));
    }

    /// Forms a 4-by-4 matrix by concatenating four components
    CUTLASS_HOST_DEVICE
    static Matrix block(Matrix<Element, 3, 2> const& A,
                        Matrix<Element, 3, 2> const& B,
                        Matrix<Element, 1, 2> const& C,
                        Matrix<Element, 1, 2> const& D) {
        return Matrix(A.at(0, 0), A.at(0, 1), B.at(0, 0), B.at(0, 1),
                      A.at(1, 0), A.at(1, 1), B.at(1, 0), B.at(1, 1),
                      A.at(2, 0), A.at(2, 1), B.at(2, 0), B.at(2, 1),
                      C.at(0, 0), C.at(0, 1), D.at(0, 0), D.at(0, 1));
    }

    /// Forms a 4-by-4 matrix by concatenating four components
    CUTLASS_HOST_DEVICE
    static Matrix block(Matrix<Element, 3, 3> const& A,
                        Matrix<Element, 3, 1> const& B,
                        Matrix<Element, 1, 3> const& C, Element D) {
        return Matrix(A.at(0, 0), A.at(0, 1), A.at(0, 2), B.at(0, 0),
                      A.at(1, 0), A.at(1, 1), A.at(1, 2), B.at(1, 0),
                      A.at(2, 0), A.at(2, 1), A.at(2, 2), B.at(2, 0),
                      C.at(0, 0), C.at(0, 1), C.at(0, 2), D);
    }

    /// Elementwise add operator (4-by-4)
    CUTLASS_HOST_DEVICE
    Matrix add(Matrix const& rhs) const {
        Matrix result;

        result.data[0] = data[0] + rhs.data[0];
        result.data[1] = data[1] + rhs.data[1];
        result.data[2] = data[2] + rhs.data[2];
        result.data[3] = data[3] + rhs.data[3];

        result.data[4] = data[4] + rhs.data[4];
        result.data[5] = data[5] + rhs.data[5];
        result.data[6] = data[6] + rhs.data[6];
        result.data[7] = data[7] + rhs.data[7];

        result.data[8] = data[8] + rhs.data[8];
        result.data[9] = data[9] + rhs.data[9];
        result.data[10] = data[10] + rhs.data[10];
        result.data[11] = data[11] + rhs.data[11];

        result.data[12] = data[12] + rhs.data[12];
        result.data[13] = data[13] + rhs.data[13];
        result.data[14] = data[14] + rhs.data[14];
        result.data[15] = data[15] + rhs.data[15];

        return result;
    }

    /// Elementwise add operator (4-by-4)
    CUTLASS_HOST_DEVICE
    Matrix operator+(Matrix const& rhs) const { return add(rhs); }

    /// Elementwise add operator (4-by-4)
    CUTLASS_HOST_DEVICE
    Matrix& operator+=(Matrix const& rhs) {
        data[0] += rhs.data[0];
        data[1] += rhs.data[1];
        data[2] += rhs.data[2];
        data[3] += rhs.data[3];

        data[4] += rhs.data[4];
        data[5] += rhs.data[5];
        data[6] += rhs.data[6];
        data[7] += rhs.data[7];

        data[8] += rhs.data[8];
        data[9] += rhs.data[9];
        data[10] += rhs.data[10];
        data[11] += rhs.data[11];

        data[12] += rhs.data[12];
        data[13] += rhs.data[13];
        data[14] += rhs.data[14];
        data[15] += rhs.data[15];

        return *this;
    }

    /// Elementwise subtract operator (4-by-4)
    CUTLASS_HOST_DEVICE
    Matrix subtract(Matrix const& rhs) const {
        Matrix result;

        result.data[0] = data[0] - rhs.data[0];
        result.data[1] = data[1] - rhs.data[1];
        result.data[2] = data[2] - rhs.data[2];
        result.data[3] = data[3] - rhs.data[3];

        result.data[4] = data[4] - rhs.data[4];
        result.data[5] = data[5] - rhs.data[5];
        result.data[6] = data[6] - rhs.data[6];
        result.data[7] = data[7] - rhs.data[7];

        result.data[8] = data[8] - rhs.data[8];
        result.data[9] = data[9] - rhs.data[9];
        result.data[10] = data[10] - rhs.data[10];
        result.data[11] = data[11] - rhs.data[11];

        result.data[12] = data[12] - rhs.data[12];
        result.data[13] = data[13] - rhs.data[13];
        result.data[14] = data[14] - rhs.data[14];
        result.data[15] = data[15] - rhs.data[15];

        return result;
    }

    /// Elementwise subtract operator (4-by-4)
    CUTLASS_HOST_DEVICE
    Matrix operator-(Matrix const& rhs) const { return subtract(rhs); }

    /// Elementwise subtract operator (4-by-4)
    CUTLASS_HOST_DEVICE
    Matrix& operator-=(Matrix const& rhs) {
        data[0] -= rhs.data[0];
        data[1] -= rhs.data[1];
        data[2] -= rhs.data[2];
        data[3] -= rhs.data[3];

        data[4] -= rhs.data[4];
        data[5] -= rhs.data[5];
        data[6] -= rhs.data[6];
        data[7] -= rhs.data[7];

        data[8] -= rhs.data[8];
        data[9] -= rhs.data[9];
        data[10] -= rhs.data[10];
        data[11] -= rhs.data[11];

        data[12] -= rhs.data[12];
        data[13] -= rhs.data[13];
        data[14] -= rhs.data[14];
        data[15] -= rhs.data[15];

        return *this;
    }

    /// Elementwise multiply operator (4-by-4)
    CUTLASS_HOST_DEVICE
    Matrix multiply(Matrix const& rhs) const {
        Matrix result;

        result.data[0] = data[0] * rhs.data[0];
        result.data[1] = data[1] * rhs.data[1];
        result.data[2] = data[2] * rhs.data[2];
        result.data[3] = data[3] * rhs.data[3];

        result.data[4] = data[4] * rhs.data[4];
        result.data[5] = data[5] * rhs.data[5];
        result.data[6] = data[6] * rhs.data[6];
        result.data[7] = data[7] * rhs.data[7];

        result.data[8] = data[8] * rhs.data[8];
        result.data[9] = data[9] * rhs.data[9];
        result.data[10] = data[10] * rhs.data[10];
        result.data[11] = data[11] * rhs.data[11];

        result.data[12] = data[12] * rhs.data[12];
        result.data[13] = data[13] * rhs.data[13];
        result.data[14] = data[14] * rhs.data[14];
        result.data[15] = data[15] * rhs.data[15];

        return result;
    }

    /// Scalar multiply operator (4-by-4)
    CUTLASS_HOST_DEVICE
    Matrix multiply(Element const& s) const {
        Matrix result;

        result.data[0] = data[0] * s;
        result.data[1] = data[1] * s;
        result.data[2] = data[2] * s;
        result.data[3] = data[3] * s;

        result.data[4] = data[4] * s;
        result.data[5] = data[5] * s;
        result.data[6] = data[6] * s;
        result.data[7] = data[7] * s;

        result.data[8] = data[8] * s;
        result.data[9] = data[9] * s;
        result.data[10] = data[10] * s;
        result.data[11] = data[11] * s;

        result.data[12] = data[12] * s;
        result.data[13] = data[13] * s;
        result.data[14] = data[14] * s;
        result.data[15] = data[15] * s;

        return result;
    }

    /// Scalar multiply operator (4-by-4)
    CUTLASS_HOST_DEVICE
    Matrix operator*(Element const& s) const { return multiply(s); }

    /// Scalar multiply operator (4-by-4)
    CUTLASS_HOST_DEVICE
    Matrix& operator*=(Element const& s) {
        data[0] *= s;
        data[1] *= s;
        data[2] *= s;
        data[3] *= s;

        data[4] *= s;
        data[5] *= s;
        data[6] *= s;
        data[7] *= s;

        data[8] *= s;
        data[9] *= s;
        data[10] *= s;
        data[11] *= s;

        data[12] *= s;
        data[13] *= s;
        data[14] *= s;
        data[15] *= s;

        return *this;
    }

    /// Elementwise divide operator (4-by-4)
    CUTLASS_HOST_DEVICE
    Matrix divide(Matrix const& rhs) const {
        Matrix result;

        result.data[0] = data[0] / rhs.data[0];
        result.data[1] = data[1] / rhs.data[1];
        result.data[2] = data[2] / rhs.data[2];
        result.data[3] = data[3] / rhs.data[3];

        result.data[4] = data[4] / rhs.data[4];
        result.data[5] = data[5] / rhs.data[5];
        result.data[6] = data[6] / rhs.data[6];
        result.data[7] = data[7] / rhs.data[7];

        result.data[8] = data[8] / rhs.data[8];
        result.data[9] = data[9] / rhs.data[9];
        result.data[10] = data[10] / rhs.data[10];
        result.data[11] = data[11] / rhs.data[11];

        result.data[12] = data[12] / rhs.data[12];
        result.data[13] = data[13] / rhs.data[13];
        result.data[14] = data[14] / rhs.data[14];
        result.data[15] = data[15] / rhs.data[15];

        return result;
    }

    /// Scalar divide operator (4-by-4)
    CUTLASS_HOST_DEVICE
    Matrix divide(Element const& s) const {
        Matrix result;

        result.data[0] = data[0] / s;
        result.data[1] = data[1] / s;
        result.data[2] = data[2] / s;
        result.data[3] = data[3] / s;

        result.data[4] = data[4] / s;
        result.data[5] = data[5] / s;
        result.data[6] = data[6] / s;
        result.data[7] = data[7] / s;

        result.data[8] = data[8] / s;
        result.data[9] = data[9] / s;
        result.data[10] = data[10] / s;
        result.data[11] = data[11] / s;

        result.data[12] = data[12] / s;
        result.data[13] = data[13] / s;
        result.data[14] = data[14] / s;
        result.data[15] = data[15] / s;

        return result;
    }

    /// Scalar divide operator (4-by-4)
    CUTLASS_HOST_DEVICE
    Matrix operator/(Element const& s) const { return divide(s); }

    /// Scalar divide operator (4-by-4)
    CUTLASS_HOST_DEVICE
    Matrix& operator/=(Element const& s) {
        data[0] /= s;
        data[1] /= s;
        data[2] /= s;
        data[3] /= s;

        data[4] /= s;
        data[5] /= s;
        data[6] /= s;
        data[7] /= s;

        data[8] /= s;
        data[9] /= s;
        data[10] /= s;
        data[11] /= s;

        data[12] /= s;
        data[13] /= s;
        data[14] /= s;
        data[15] /= s;

        return *this;
    }

    /// Elementwise divide operator (4-by-4)
    CUTLASS_HOST_DEVICE
    Matrix operator/(Matrix const& rhs) const { return divide(rhs); }

    /// Elementwise divide operator (4-by-4)
    CUTLASS_HOST_DEVICE
    Matrix& operator/=(Matrix const& rhs) {
        data[0] /= rhs.data[0];
        data[1] /= rhs.data[1];
        data[2] /= rhs.data[2];
        data[3] /= rhs.data[3];

        data[4] /= rhs.data[4];
        data[5] /= rhs.data[5];
        data[6] /= rhs.data[6];
        data[7] /= rhs.data[7];

        data[8] /= rhs.data[8];
        data[9] /= rhs.data[9];
        data[10] /= rhs.data[10];
        data[11] /= rhs.data[11];

        data[12] /= rhs.data[12];
        data[13] /= rhs.data[13];
        data[14] /= rhs.data[14];
        data[15] /= rhs.data[15];

        return *this;
    }

    /// Negates each element of the matrix
    CUTLASS_HOST_DEVICE
    Matrix operator-() const {
        Matrix m;

        m.data[0] = -m.data[0];
        m.data[1] = -m.data[1];
        m.data[2] = -m.data[2];
        m.data[3] = -m.data[3];
        m.data[4] = -m.data[4];
        m.data[5] = -m.data[5];
        m.data[6] = -m.data[6];
        m.data[7] = -m.data[7];
        m.data[8] = -m.data[8];
        m.data[9] = -m.data[9];
        m.data[10] = -m.data[10];
        m.data[11] = -m.data[11];
        m.data[12] = -m.data[12];
        m.data[13] = -m.data[13];
        m.data[14] = -m.data[14];
        m.data[15] = -m.data[15];

        return m;
    }

    /// Matrix product of size 4-by-1-by-4
    CUTLASS_HOST_DEVICE
    Matrix<Element, 4, 1> product(
            Matrix<Element, 4, 1> const& rhs,
            Matrix<Element, 4, 1> accum = Matrix<Element, 4, 1>()) const {
        // k=0
        accum.data[0] += data[0] * rhs.data[0];
        accum.data[1] += data[4] * rhs.data[0];
        accum.data[2] += data[8] * rhs.data[0];
        accum.data[3] += data[12] * rhs.data[0];

        // k=1
        accum.data[0] += data[1] * rhs.data[1];
        accum.data[1] += data[5] * rhs.data[1];
        accum.data[2] += data[9] * rhs.data[1];
        accum.data[3] += data[13] * rhs.data[1];

        // k=2
        accum.data[0] += data[2] * rhs.data[2];
        accum.data[1] += data[6] * rhs.data[2];
        accum.data[2] += data[10] * rhs.data[2];
        accum.data[3] += data[14] * rhs.data[2];

        // k=3
        accum.data[0] += data[3] * rhs.data[3];
        accum.data[1] += data[7] * rhs.data[3];
        accum.data[2] += data[11] * rhs.data[3];
        accum.data[3] += data[15] * rhs.data[3];

        return accum;
    }

    /// Matrix product of size 4-by-1-by-4
    CUTLASS_HOST_DEVICE
    Matrix<Element, 4, 1> operator*(Matrix<Element, 4, 1> const& rhs) const {
        return product(rhs);
    }

    /// Matrix product of size 4-by-2-by-4
    CUTLASS_HOST_DEVICE
    Matrix<Element, 4, 2> product(
            Matrix<Element, 4, 2> const& rhs,
            Matrix<Element, 4, 2> accum = Matrix<Element, 4, 2>()) const {
        // k=0
        accum.data[0] += data[0] * rhs.data[0];
        accum.data[1] += data[0] * rhs.data[1];
        accum.data[2] += data[4] * rhs.data[0];
        accum.data[3] += data[4] * rhs.data[1];
        accum.data[4] += data[8] * rhs.data[0];
        accum.data[5] += data[8] * rhs.data[1];
        accum.data[6] += data[12] * rhs.data[0];
        accum.data[7] += data[12] * rhs.data[1];

        // k=1
        accum.data[0] += data[1] * rhs.data[2];
        accum.data[1] += data[1] * rhs.data[3];
        accum.data[2] += data[5] * rhs.data[2];
        accum.data[3] += data[5] * rhs.data[3];
        accum.data[4] += data[9] * rhs.data[2];
        accum.data[5] += data[9] * rhs.data[3];
        accum.data[6] += data[13] * rhs.data[2];
        accum.data[7] += data[13] * rhs.data[3];

        // k=2
        accum.data[0] += data[2] * rhs.data[4];
        accum.data[1] += data[2] * rhs.data[5];
        accum.data[2] += data[6] * rhs.data[4];
        accum.data[3] += data[6] * rhs.data[5];
        accum.data[4] += data[10] * rhs.data[4];
        accum.data[5] += data[10] * rhs.data[5];
        accum.data[6] += data[14] * rhs.data[4];
        accum.data[7] += data[14] * rhs.data[5];

        // k=3
        accum.data[0] += data[3] * rhs.data[6];
        accum.data[1] += data[3] * rhs.data[7];
        accum.data[2] += data[7] * rhs.data[6];
        accum.data[3] += data[7] * rhs.data[7];
        accum.data[4] += data[11] * rhs.data[6];
        accum.data[5] += data[11] * rhs.data[7];
        accum.data[6] += data[15] * rhs.data[6];
        accum.data[7] += data[15] * rhs.data[7];

        return accum;
    }

    /// Matrix product of size 4-by-2-by-4
    CUTLASS_HOST_DEVICE
    Matrix<Element, 4, 2> operator*(Matrix<Element, 4, 2> const& rhs) const {
        return product(rhs);
    }

    /// Matrix product of size 4-by-3-by-4
    CUTLASS_HOST_DEVICE
    Matrix<Element, 4, 3> product(
            Matrix<Element, 4, 3> const& rhs,
            Matrix<Element, 4, 3> accum = Matrix<Element, 4, 3>()) const {
        // k=0
        accum.data[0] += data[0] * rhs.data[0];
        accum.data[1] += data[0] * rhs.data[1];
        accum.data[2] += data[0] * rhs.data[2];
        accum.data[3] += data[4] * rhs.data[0];
        accum.data[4] += data[4] * rhs.data[1];
        accum.data[5] += data[4] * rhs.data[2];
        accum.data[6] += data[8] * rhs.data[0];
        accum.data[7] += data[8] * rhs.data[1];
        accum.data[8] += data[8] * rhs.data[2];
        accum.data[9] += data[12] * rhs.data[0];
        accum.data[10] += data[12] * rhs.data[1];
        accum.data[11] += data[12] * rhs.data[2];

        // k=1
        accum.data[0] += data[1] * rhs.data[3];
        accum.data[1] += data[1] * rhs.data[4];
        accum.data[2] += data[1] * rhs.data[5];
        accum.data[3] += data[5] * rhs.data[3];
        accum.data[4] += data[5] * rhs.data[4];
        accum.data[5] += data[5] * rhs.data[5];
        accum.data[6] += data[9] * rhs.data[3];
        accum.data[7] += data[9] * rhs.data[4];
        accum.data[8] += data[9] * rhs.data[5];
        accum.data[9] += data[13] * rhs.data[3];
        accum.data[10] += data[13] * rhs.data[4];
        accum.data[11] += data[13] * rhs.data[5];

        // k=2
        accum.data[0] += data[2] * rhs.data[6];
        accum.data[1] += data[2] * rhs.data[7];
        accum.data[2] += data[2] * rhs.data[8];
        accum.data[3] += data[6] * rhs.data[6];
        accum.data[4] += data[6] * rhs.data[7];
        accum.data[5] += data[6] * rhs.data[8];
        accum.data[6] += data[10] * rhs.data[6];
        accum.data[7] += data[10] * rhs.data[7];
        accum.data[8] += data[10] * rhs.data[8];
        accum.data[9] += data[14] * rhs.data[6];
        accum.data[10] += data[14] * rhs.data[7];
        accum.data[11] += data[14] * rhs.data[8];

        // k=3
        accum.data[0] += data[3] * rhs.data[9];
        accum.data[1] += data[3] * rhs.data[10];
        accum.data[2] += data[3] * rhs.data[11];
        accum.data[3] += data[7] * rhs.data[9];
        accum.data[4] += data[7] * rhs.data[10];
        accum.data[5] += data[7] * rhs.data[11];
        accum.data[6] += data[11] * rhs.data[9];
        accum.data[7] += data[11] * rhs.data[10];
        accum.data[8] += data[11] * rhs.data[11];
        accum.data[9] += data[15] * rhs.data[9];
        accum.data[10] += data[15] * rhs.data[10];
        accum.data[11] += data[15] * rhs.data[11];

        return accum;
    }

    /// Matrix product of size 4-by-3-by-4
    CUTLASS_HOST_DEVICE
    Matrix<Element, 4, 3> operator*(Matrix<Element, 4, 3> const& rhs) const {
        return product(rhs);
    }

    /// Matrix product of size 4-by-4-by-4
    CUTLASS_HOST_DEVICE
    Matrix<Element, 4, 4> product(
            Matrix<Element, 4, 4> const& rhs,
            Matrix<Element, 4, 4> accum = Matrix<Element, 4, 4>()) const {
        // k=0
        accum.data[0] += data[0] * rhs.data[0];
        accum.data[1] += data[0] * rhs.data[1];
        accum.data[2] += data[0] * rhs.data[2];
        accum.data[3] += data[0] * rhs.data[3];
        accum.data[4] += data[4] * rhs.data[0];
        accum.data[5] += data[4] * rhs.data[1];
        accum.data[6] += data[4] * rhs.data[2];
        accum.data[7] += data[4] * rhs.data[3];
        accum.data[8] += data[8] * rhs.data[0];
        accum.data[9] += data[8] * rhs.data[1];
        accum.data[10] += data[8] * rhs.data[2];
        accum.data[11] += data[8] * rhs.data[3];
        accum.data[12] += data[12] * rhs.data[0];
        accum.data[13] += data[12] * rhs.data[1];
        accum.data[14] += data[12] * rhs.data[2];
        accum.data[15] += data[12] * rhs.data[3];

        // k=1
        accum.data[0] += data[1] * rhs.data[4];
        accum.data[1] += data[1] * rhs.data[5];
        accum.data[2] += data[1] * rhs.data[6];
        accum.data[3] += data[1] * rhs.data[7];
        accum.data[4] += data[5] * rhs.data[4];
        accum.data[5] += data[5] * rhs.data[5];
        accum.data[6] += data[5] * rhs.data[6];
        accum.data[7] += data[5] * rhs.data[7];
        accum.data[8] += data[9] * rhs.data[4];
        accum.data[9] += data[9] * rhs.data[5];
        accum.data[10] += data[9] * rhs.data[6];
        accum.data[11] += data[9] * rhs.data[7];
        accum.data[12] += data[13] * rhs.data[4];
        accum.data[13] += data[13] * rhs.data[5];
        accum.data[14] += data[13] * rhs.data[6];
        accum.data[15] += data[13] * rhs.data[7];

        // k=2
        accum.data[0] += data[2] * rhs.data[8];
        accum.data[1] += data[2] * rhs.data[9];
        accum.data[2] += data[2] * rhs.data[10];
        accum.data[3] += data[2] * rhs.data[11];
        accum.data[4] += data[6] * rhs.data[8];
        accum.data[5] += data[6] * rhs.data[9];
        accum.data[6] += data[6] * rhs.data[10];
        accum.data[7] += data[6] * rhs.data[11];
        accum.data[8] += data[10] * rhs.data[8];
        accum.data[9] += data[10] * rhs.data[9];
        accum.data[10] += data[10] * rhs.data[10];
        accum.data[11] += data[10] * rhs.data[11];
        accum.data[12] += data[14] * rhs.data[8];
        accum.data[13] += data[14] * rhs.data[9];
        accum.data[14] += data[14] * rhs.data[10];
        accum.data[15] += data[14] * rhs.data[11];

        // k=3
        accum.data[0] += data[3] * rhs.data[12];
        accum.data[1] += data[3] * rhs.data[13];
        accum.data[2] += data[3] * rhs.data[14];
        accum.data[3] += data[3] * rhs.data[15];
        accum.data[4] += data[7] * rhs.data[12];
        accum.data[5] += data[7] * rhs.data[13];
        accum.data[6] += data[7] * rhs.data[14];
        accum.data[7] += data[7] * rhs.data[15];
        accum.data[8] += data[11] * rhs.data[12];
        accum.data[9] += data[11] * rhs.data[13];
        accum.data[10] += data[11] * rhs.data[14];
        accum.data[11] += data[11] * rhs.data[15];
        accum.data[12] += data[15] * rhs.data[12];
        accum.data[13] += data[15] * rhs.data[13];
        accum.data[14] += data[15] * rhs.data[14];
        accum.data[15] += data[15] * rhs.data[15];

        return accum;
    }

    /// Matrix product of size 4-by-4-by-4
    CUTLASS_HOST_DEVICE
    Matrix<Element, 4, 4> operator*(Matrix<Element, 4, 4> const& rhs) const {
        return product(rhs);
    }

    /// Matrix product of size 4-by-4-by-4
    CUTLASS_HOST_DEVICE
    Matrix& operator*=(Matrix<Element, 4, 4> const& rhs) {
        *this = product(rhs);
        return *this;
    }

    /// Returns the sum of elements
    CUTLASS_HOST_DEVICE
    Element sum(Element accum = Element()) const {
        accum += data[0];
        accum += data[1];
        accum += data[2];
        accum += data[3];
        accum += data[4];
        accum += data[5];
        accum += data[6];
        accum += data[7];
        accum += data[8];
        accum += data[9];
        accum += data[10];
        accum += data[11];
        accum += data[12];
        accum += data[13];
        accum += data[14];
        accum += data[15];

        return accum;
    }

    /// Returns the sum of squared elements
    CUTLASS_HOST_DEVICE
    Element norm(Element accum = Element()) const {
        accum += data[0] * data[0];
        accum += data[1] * data[1];
        accum += data[2] * data[2];
        accum += data[3] * data[3];
        accum += data[4] * data[4];
        accum += data[5] * data[5];
        accum += data[6] * data[6];
        accum += data[7] * data[7];
        accum += data[8] * data[8];
        accum += data[9] * data[9];
        accum += data[10] * data[10];
        accum += data[11] * data[11];
        accum += data[12] * data[12];
        accum += data[13] * data[13];
        accum += data[14] * data[14];
        accum += data[15] * data[15];

        return accum;
    }

    /// Returns square root of the norm
    CUTLASS_HOST_DEVICE
    Element magnitude() const { return fast_sqrt(norm()); }

    /// Returns the sum of diagonal elements
    CUTLASS_HOST_DEVICE
    Element trace(Element accum = Element()) const {
        accum += data[0];
        accum += data[5];
        accum += data[10];
        accum += data[15];

        return accum;
    }

    /// Returns 4-by-4 rotation matrix around the X axis
    CUTLASS_HOST_DEVICE
    static Matrix rotation_X(Element theta) {
        Matrix m = identity();

        Element c = fast_cos(theta);
        Element s = fast_sin(theta);

        m.at(1, 1) = c;
        m.at(1, 2) = -s;
        m.at(2, 1) = s;
        m.at(2, 2) = c;

        return m;
    }

    /// Returns 4-by-4 rotation matrix around the Y axis
    CUTLASS_HOST_DEVICE
    static Matrix rotation_Y(Element theta) {
        Matrix m = identity();

        Element c = fast_cos(theta);
        Element s = fast_sin(theta);

        m.at(0, 0) = c;
        m.at(2, 0) = -s;
        m.at(0, 2) = s;
        m.at(2, 2) = c;

        return m;
    }

    /// Returns 4-by-4 rotation matrix around the Z axis
    CUTLASS_HOST_DEVICE
    static Matrix rotation_Z(Element theta) {
        Matrix m = Matrix::identity();

        Element c = fast_cos(theta);
        Element s = fast_sin(theta);

        m.at(0, 0) = c;
        m.at(0, 1) = -s;
        m.at(1, 0) = s;
        m.at(1, 1) = c;

        return m;
    }

    /// Returns a 4-by-4 rotation matrix around a unit-length axis
    CUTLASS_HOST_DEVICE
    static Matrix rotation(Element theta, Matrix<Element, 3, 1> const& u) {
        Element x = u.data[0];
        Element y = u.data[1];
        Element z = u.data[2];

        Element c = fast_cos(theta);
        Element s = fast_sin(theta);

        Element one_minus_cos = Element(1) - fast_cos(theta);

        Matrix m;

        m.set_slice3x3(
                {c + x * x * one_minus_cos, x * y * one_minus_cos - z * s,
                 x * z * one_minus_cos + y * s, y * x * one_minus_cos * z * s,
                 c + y * y * one_minus_cos, y * z * one_minus_cos - x * s,
                 z * x * one_minus_cos - y * s, z * y * one_minus_cos + x * s,
                 c + z * z * one_minus_cos});

        return m;
    }

    /// Returns a 4-by-4 reflection about the plane specified by the
    /// unit-length normal vector n_unit
    CUTLASS_HOST_DEVICE
    static Matrix reflection(Matrix<Element, 3, 1> const& n_unit) {
        Element a = n_unit.data[0];
        Element b = n_unit.data[1];
        Element c = n_unit.data[2];

        Matrix m = Matrix::identity();

        m.set_slice3x3({Element(1) - Element(2) * a * a, Element(-2) * a * b,
                        Element(-2) * a * c, Element(-2) * a * b,
                        Element(1) - Element(2) * b * b, Element(-2) * b * c,
                        Element(-2) * a * c, Element(-2) * b * c,
                        Element(1) - Element(2) * c * c});

        return m;
    }

    /// Returns a perspective projection matrix typical of OpenGL applications
    CUTLASS_HOST_DEVICE
    static Matrix perspective(Element near, Element far, Element fovH,
                              Element fovV) {
        Element aspect = fovH / fovV;
        Element f = Element(cos(fovV)) / Element(fovH);
        Element Q = near - far;

        return Matrix(f / aspect, 0, 0, 0, 0, f, 0, 0, 0, 0, (near + far) / Q,
                      Element(2) * far * near / Q, 0, 0, -1, 0);
    }

    CUTLASS_HOST_DEVICE
    static Matrix translation(Matrix<Element, 3, 1> const& v) {
        return Matrix(1, 0, 0, v.data[0], 0, 1, 0, v.data[1], 0, 0, 1,
                      v.data[2], 0, 0, 0, 1);
    }

    /// Computes the determinant of a 4-by-4 matrix
    CUTLASS_HOST_DEVICE
    Element determinant(Element accum = Element()) const {
        accum +=
                at(0, 0) * Matrix<Element, 3, 3>({at(1, 1), at(1, 2), at(1, 3),
                                                  at(2, 1), at(2, 2), at(2, 3),
                                                  at(3, 1), at(3, 2), at(3, 3)})
                                   .determinant();
        accum -=
                at(0, 1) * Matrix<Element, 3, 3>({at(1, 0), at(1, 2), at(1, 3),
                                                  at(2, 0), at(2, 2), at(2, 3),
                                                  at(3, 0), at(3, 2), at(3, 3)})
                                   .determinant();
        accum +=
                at(0, 2) * Matrix<Element, 3, 3>({at(1, 0), at(1, 1), at(1, 3),
                                                  at(2, 0), at(2, 1), at(2, 3),
                                                  at(3, 0), at(3, 1), at(3, 3)})
                                   .determinant();
        accum -=
                at(0, 3) * Matrix<Element, 3, 3>({at(1, 0), at(1, 1), at(1, 2),
                                                  at(2, 0), at(2, 1), at(2, 2),
                                                  at(3, 0), at(3, 1), at(3, 2)})
                                   .determinant();

        return accum;
    }

    /// Computes the inverse of a 4-by-4 matrix (ignores the optional argument)
    CUTLASS_HOST_DEVICE
    Matrix inverse(Element ignore = 1) const {
        Matrix<Element, 2, 2> B = slice_2x2(0, 2);
        Matrix<Element, 2, 2> A = slice_2x2(0, 0);
        Matrix<Element, 2, 2> C = slice_2x2(2, 0);
        Matrix<Element, 2, 2> D = slice_2x2(2, 2);

        Matrix<Element, 2, 2> D_inv = D.inverse();

        Matrix<Element, 2, 2> E = (A - B * D_inv * C).inverse();

        return Matrix::block(E, -E * B * D_inv, -D_inv * C * E,
                             D_inv + D_inv * C * E * B * D_inv);
    }
};

/// Template alias for 4-by-4 matrix
template <typename Element>
using Matrix4x4 = Matrix<Element, 4, 4>;

/// Free funciton to infer element type from template arguments
template <typename Element>
CUTLASS_HOST_DEVICE Matrix4x4<Element> make_Matrix4x4(
        Element _0_0, Element _0_1, Element _0_2, Element _0_3, Element _1_0,
        Element _1_1, Element _1_2, Element _1_3, Element _2_0, Element _2_1,
        Element _2_2, Element _2_3, Element _3_0, Element _3_1, Element _3_2,
        Element _3_3) {
    return Matrix4x4<Element>(_0_0, _0_1, _0_2, _0_3, _1_0, _1_1, _1_2, _1_3,
                              _2_0, _2_1, _2_2, _2_3, _3_0, _3_1, _3_2, _3_3);
}

/////////////////////////////////////////////////////////////////////////////////////////////////

/// Elementwise scalar multiplication
template <typename Element, int Rows, int Columns>
CUTLASS_HOST_DEVICE Matrix<Element, Rows, Columns> operator*(
        Element s, Matrix<Element, Rows, Columns> const& rhs) {
    return rhs.multiply(s);
}

/// Prints matrix to ostream
template <typename Element, int Rows, int Columns>
std::ostream& operator<<(std::ostream& out,
                         Matrix<Element, Rows, Columns> const& rhs) {
    for (int i = 0; i < Rows; ++i) {
        for (int j = 0; j < Columns; ++j) {
            out << (j ? ", " : "") << rhs.at(i, j);
        }
        out << "\n";
    }

    return out;
}

/////////////////////////////////////////////////////////////////////////////////////////////////

}  // namespace cutlass

/////////////////////////////////////////////////////////////////////////////////////////////////
